"""howard.objects.variants"""

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing as mp
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26import cyvcf2
   27import pyBigWig
   28import math
   29
   30from howard.functions.commons import *
   31from howard.objects.database import *
   32from howard.functions.databases import *
   33from howard.functions.utils import *
   34
   35
   36class Variants:
   37
   38    def __init__(
   39        self,
   40        conn=None,
   41        input: str = None,
   42        output: str = None,
   43        config: dict = {},
   44        param: dict = {},
   45        load: bool = False,
   46    ) -> None:
   47        """
   48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   49        header
   50
   51        :param conn: the connection to the database
   52        :param input: the input file
   53        :param output: the output file
   54        :param config: a dictionary containing the configuration of the model
   55        :param param: a dictionary containing the parameters of the model
   56        """
   57
   58        # Init variables
   59        self.init_variables()
   60
   61        # Input
   62        self.set_input(input)
   63
   64        # Config
   65        self.set_config(config)
   66
   67        # Param
   68        self.set_param(param)
   69
   70        # Output
   71        self.set_output(output)
   72
   73        # connexion
   74        self.set_connexion(conn)
   75
   76        # Header
   77        self.set_header()
   78
   79        # Samples
   80        self.set_samples()
   81
   82        # Load data
   83        if load:
   84            self.load_data()
   85
   86    def set_samples(self, samples: list = None) -> list:
   87        """
   88        The function `set_samples` sets the samples attribute of an object to a provided list or
   89        retrieves it from a parameter dictionary.
   90
   91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   92        input and sets the `samples` attribute of the class to the provided list. If no samples are
   93        provided, it tries to get the samples from the class's parameters using the `get_param` method
   94        :type samples: list
   95        :return: The `samples` list is being returned.
   96        """
   97
   98        if not samples:
   99            samples = self.get_param().get("samples", {}).get("list", None)
  100
  101        self.samples = samples
  102
  103        return samples
  104
    def get_samples(self) -> list:
        """
        Return the current list of samples.

        :return: the `samples` attribute of the object (as set by
        `set_samples`)
        """

        return self.samples
  112
    def get_samples_check(self) -> bool:
        """
        Return whether sample checking is enabled.

        Reads the "check" key of the "samples" section of the parameters
        returned by `get_param`.
        :return: the value of param["samples"]["check"]; defaults to `True`
        when the key is absent.
        """

        return self.get_param().get("samples", {}).get("check", True)
  123
  124    def set_input(self, input: str = None) -> None:
  125        """
  126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  127        attributes in the class accordingly.
  128
  129        :param input: The `set_input` method in the provided code snippet is used to set attributes
  130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  131        :type input: str
  132        """
  133
  134        if input and not isinstance(input, str):
  135            try:
  136                self.input = input.name
  137            except:
  138                log.error(f"Input file '{input} in bad format")
  139                raise ValueError(f"Input file '{input} in bad format")
  140        else:
  141            self.input = input
  142
  143        # Input format
  144        if input:
  145            input_name, input_extension = os.path.splitext(self.input)
  146            self.input_name = input_name
  147            self.input_extension = input_extension
  148            self.input_format = self.input_extension.replace(".", "")
  149
    def set_config(self, config: dict) -> None:
        """
        Store the given dictionary as the configuration object of the class.

        :param config: dictionary of configuration settings; stored by
        reference (not copied)
        :type config: dict
        """

        self.config = config
  163
    def set_param(self, param: dict) -> None:
        """
        Store the given dictionary as the parameter object of the class.

        :param param: dictionary of parameters; stored by reference (not
        copied)
        :type param: dict
        """

        self.param = param
  174
  175    def init_variables(self) -> None:
  176        """
  177        This function initializes the variables that will be used in the rest of the class
  178        """
  179
  180        self.prefix = "howard"
  181        self.table_variants = "variants"
  182        self.dataframe = None
  183
  184        self.comparison_map = {
  185            "gt": ">",
  186            "gte": ">=",
  187            "lt": "<",
  188            "lte": "<=",
  189            "equals": "=",
  190            "contains": "SIMILAR TO",
  191        }
  192
  193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  194
  195        self.code_type_map_to_sql = {
  196            "Integer": "INTEGER",
  197            "String": "VARCHAR",
  198            "Float": "FLOAT",
  199            "Flag": "VARCHAR",
  200        }
  201
  202        self.index_additionnal_fields = []
  203
  204    def get_indexing(self) -> bool:
  205        """
  206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  207        returns False.
  208        :return: The value of the indexing parameter.
  209        """
  210
  211        return self.get_param().get("indexing", False)
  212
  213    def get_connexion_config(self) -> dict:
  214        """
  215        The function `get_connexion_config` returns a dictionary containing the configuration for a
  216        connection, including the number of threads and memory limit.
  217        :return: a dictionary containing the configuration for the Connexion library.
  218        """
  219
  220        # config
  221        config = self.get_config()
  222
  223        # Connexion config
  224        connexion_config = {}
  225        threads = self.get_threads()
  226
  227        # Threads
  228        if threads:
  229            connexion_config["threads"] = threads
  230
  231        # Memory
  232        # if config.get("memory", None):
  233        #     connexion_config["memory_limit"] = config.get("memory")
  234        if self.get_memory():
  235            connexion_config["memory_limit"] = self.get_memory()
  236
  237        # Temporary directory
  238        if config.get("tmp", None):
  239            connexion_config["temp_directory"] = config.get("tmp")
  240
  241        # Access
  242        if config.get("access", None):
  243            access = config.get("access")
  244            if access in ["RO"]:
  245                access = "READ_ONLY"
  246            elif access in ["RW"]:
  247                access = "READ_WRITE"
  248            connexion_db = self.get_connexion_db()
  249            if connexion_db in ":memory:":
  250                access = "READ_WRITE"
  251            connexion_config["access_mode"] = access
  252
  253        return connexion_config
  254
  255    def get_duckdb_settings(self) -> dict:
  256        """
  257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  258        string.
  259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  260        """
  261
  262        # config
  263        config = self.get_config()
  264
  265        # duckdb settings
  266        duckdb_settings_dict = {}
  267        if config.get("duckdb_settings", None):
  268            duckdb_settings = config.get("duckdb_settings")
  269            duckdb_settings = full_path(duckdb_settings)
  270            # duckdb setting is a file
  271            if os.path.exists(duckdb_settings):
  272                with open(duckdb_settings) as json_file:
  273                    duckdb_settings_dict = yaml.safe_load(json_file)
  274            # duckdb settings is a string
  275            else:
  276                duckdb_settings_dict = json.loads(duckdb_settings)
  277
  278        return duckdb_settings_dict
  279
  280    def set_connexion_db(self) -> str:
  281        """
  282        The function `set_connexion_db` returns the appropriate database connection string based on the
  283        input format and connection type.
  284        :return: the value of the variable `connexion_db`.
  285        """
  286
  287        # Default connexion db
  288        default_connexion_db = ":memory:"
  289
  290        # Find connexion db
  291        if self.get_input_format() in ["db", "duckdb"]:
  292            connexion_db = self.get_input()
  293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  294            connexion_db = default_connexion_db
  295        elif self.get_connexion_type() in ["tmpfile"]:
  296            tmp_name = tempfile.mkdtemp(
  297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  298            )
  299            connexion_db = f"{tmp_name}/tmp.db"
  300        elif self.get_connexion_type() != "":
  301            connexion_db = self.get_connexion_type()
  302        else:
  303            connexion_db = default_connexion_db
  304
  305        # Set connexion db
  306        self.connexion_db = connexion_db
  307
  308        return connexion_db
  309
  310    def set_connexion(self, conn) -> None:
  311        """
  312        The function `set_connexion` creates a connection to a database, with options for different
  313        database formats and settings.
  314
  315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  316        database. If a connection is not provided, a new connection to an in-memory database is created.
  317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  318        sqlite
  319        """
  320
  321        # Connexion db
  322        connexion_db = self.set_connexion_db()
  323
  324        # Connexion config
  325        connexion_config = self.get_connexion_config()
  326
  327        # Connexion format
  328        connexion_format = self.get_config().get("connexion_format", "duckdb")
  329        # Set connexion format
  330        self.connexion_format = connexion_format
  331
  332        # Connexion
  333        if not conn:
  334            if connexion_format in ["duckdb"]:
  335                conn = duckdb.connect(connexion_db, config=connexion_config)
  336                # duckDB settings
  337                duckdb_settings = self.get_duckdb_settings()
  338                if duckdb_settings:
  339                    for setting in duckdb_settings:
  340                        setting_value = duckdb_settings.get(setting)
  341                        if isinstance(setting_value, str):
  342                            setting_value = f"'{setting_value}'"
  343                        conn.execute(f"PRAGMA {setting}={setting_value};")
  344            elif connexion_format in ["sqlite"]:
  345                conn = sqlite3.connect(connexion_db)
  346
  347        # Set connexion
  348        self.conn = conn
  349
  350        # Log
  351        log.debug(f"connexion_format: {connexion_format}")
  352        log.debug(f"connexion_db: {connexion_db}")
  353        log.debug(f"connexion config: {connexion_config}")
  354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  355
  356    def set_output(self, output: str = None) -> None:
  357        """
  358        The `set_output` function in Python sets the output file based on the input or a specified key
  359        in the config file, extracting the output name, extension, and format.
  360
  361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  362        the output file. If the config file has an 'output' key, the method sets the output to the value
  363        of that key. If no output is provided, it sets the output to `None`
  364        :type output: str
  365        """
  366
  367        if output and not isinstance(output, str):
  368            self.output = output.name
  369        else:
  370            self.output = output
  371
  372        # Output format
  373        if self.output:
  374            output_name, output_extension = os.path.splitext(self.output)
  375            self.output_name = output_name
  376            self.output_extension = output_extension
  377            self.output_format = self.output_extension.replace(".", "")
  378        else:
  379            self.output_name = None
  380            self.output_extension = None
  381            self.output_format = None
  382
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`self.header_list`) and as a VCF reader object
        (`self.header_vcf`). Both are set to None when there is no input.

        Header resolution order:
        1. explicit "header_file" in config,
        2. header embedded in the (possibly bgzipped) VCF/HDR input itself,
        3. external "<input>.hdr" file,
        4. header inferred from the file columns via Database,
        5. a minimal default VCF header as fallback.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): file opened without a `with`
                            # block; it is closed explicitly below
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except — any failure (including
                    # KeyboardInterrupt) falls back to the default header
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown/unsupported format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Fallback when nothing could be read
            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header
            self.header_list = None
            self.header_vcf = None
  484
  485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  486        """
  487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  488        DataFrame based on the connection format.
  489
  490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  491        represents the SQL query you want to execute. This query will be used to fetch data from a
  492        database and convert it into a pandas DataFrame
  493        :type query: str
  494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  496        function will only fetch up to that number of rows from the database query result. If no limit
  497        is specified,
  498        :type limit: int
  499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  500        """
  501
  502        # Connexion format
  503        connexion_format = self.get_connexion_format()
  504
  505        # Limit in query
  506        if limit:
  507            pd.set_option("display.max_rows", limit)
  508            if connexion_format in ["duckdb"]:
  509                df = (
  510                    self.conn.execute(query)
  511                    .fetch_record_batch(limit)
  512                    .read_next_batch()
  513                    .to_pandas()
  514                )
  515            elif connexion_format in ["sqlite"]:
  516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  517
  518        # Full query
  519        else:
  520            if connexion_format in ["duckdb"]:
  521                df = self.conn.execute(query).df()
  522            elif connexion_format in ["sqlite"]:
  523                df = pd.read_sql_query(query, self.conn)
  524
  525        return df
  526
    def get_overview(self) -> None:
        """
        Log an overview of the current object: input/output files with their
        formats, config, param, sample list, and the variants table rendered
        as a pandas DataFrame.

        :return: None
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input:  "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        # Config and param are logged as indented JSON, one line at a time
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector: free the potentially large DataFrame right away
        del df
        gc.collect()

        return None
  569
  570    def get_stats(self) -> dict:
  571        """
  572        The `get_stats` function calculates and returns various statistics of the current object,
  573        including information about the input file, variants, samples, header fields, quality, and
  574        SNVs/InDels.
  575        :return: a dictionary containing various statistics of the current object. The dictionary has
  576        the following structure:
  577        """
  578
  579        # Log
  580        log.info(f"Stats Calculation...")
  581
  582        # table varaints
  583        table_variants_from = self.get_table_variants()
  584
  585        # stats dict
  586        stats = {"Infos": {}}
  587
  588        ### File
  589        input_file = self.get_input()
  590        stats["Infos"]["Input file"] = input_file
  591
  592        # Header
  593        header_infos = self.get_header().infos
  594        header_formats = self.get_header().formats
  595        header_infos_list = list(header_infos)
  596        header_formats_list = list(header_formats)
  597
  598        ### Variants
  599
  600        stats["Variants"] = {}
  601
  602        # Variants by chr
  603        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  604        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  605        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  606            by=["CHROM"], kind="quicksort"
  607        )
  608
  609        # Total number of variants
  610        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  611
  612        # Calculate percentage
  613        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  614            lambda x: (x / nb_of_variants)
  615        )
  616
  617        stats["Variants"]["Number of variants by chromosome"] = (
  618            nb_of_variants_by_chrom.to_dict(orient="index")
  619        )
  620
  621        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  622
  623        ### Samples
  624
  625        # Init
  626        samples = {}
  627        nb_of_samples = 0
  628
  629        # Check Samples
  630        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  631            log.debug(f"Check samples...")
  632            for sample in self.get_header_sample_list():
  633                sql_query_samples = f"""
  634                    SELECT  '{sample}' as sample,
  635                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  636                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  637                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  638                    FROM {table_variants_from}
  639                    WHERE (
  640                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  641                        AND
  642                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  643                      )
  644                    GROUP BY genotype
  645                    """
  646                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  647                sample_genotype_count = sql_query_genotype_df["count"].sum()
  648                if len(sql_query_genotype_df):
  649                    nb_of_samples += 1
  650                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  651                        sql_query_genotype_df.to_dict(orient="index")
  652                    )
  653
  654            stats["Samples"] = samples
  655            stats["Infos"]["Number of samples"] = nb_of_samples
  656
  657        # #
  658        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  659        #     stats["Infos"]["Number of samples"] = nb_of_samples
  660        # elif nb_of_samples:
  661        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  662
  663        ### INFO and FORMAT fields
  664        header_types_df = {}
  665        header_types_list = {
  666            "List of INFO fields": header_infos,
  667            "List of FORMAT fields": header_formats,
  668        }
  669        i = 0
  670        for header_type in header_types_list:
  671
  672            header_type_infos = header_types_list.get(header_type)
  673            header_infos_dict = {}
  674
  675            for info in header_type_infos:
  676
  677                i += 1
  678                header_infos_dict[i] = {}
  679
  680                # ID
  681                header_infos_dict[i]["id"] = info
  682
  683                # num
  684                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  685                if header_type_infos[info].num in genotype_map.keys():
  686                    header_infos_dict[i]["Number"] = genotype_map.get(
  687                        header_type_infos[info].num
  688                    )
  689                else:
  690                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  691
  692                # type
  693                if header_type_infos[info].type:
  694                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  695                else:
  696                    header_infos_dict[i]["Type"] = "."
  697
  698                # desc
  699                if header_type_infos[info].desc != None:
  700                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  701                else:
  702                    header_infos_dict[i]["Description"] = ""
  703
  704            if len(header_infos_dict):
  705                header_types_df[header_type] = pd.DataFrame.from_dict(
  706                    header_infos_dict, orient="index"
  707                ).to_dict(orient="index")
  708
  709        # Stats
  710        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  711        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  712        stats["Header"] = header_types_df
  713
  714        ### QUAL
  715        if "QUAL" in self.get_header_columns():
  716            sql_query_qual = f"""
  717                    SELECT
  718                        avg(CAST(QUAL AS INTEGER)) AS Average,
  719                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  720                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  721                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  722                        median(CAST(QUAL AS INTEGER)) AS Median,
  723                        variance(CAST(QUAL AS INTEGER)) AS Variance
  724                    FROM {table_variants_from}
  725                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  726                    """
  727
  728            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  729            stats["Quality"] = {"Stats": qual}
  730
  731        ### SNV and InDel
  732
  733        sql_query_snv = f"""
  734            
  735            SELECT Type, count FROM (
  736
  737                    SELECT
  738                        'Total' AS Type,
  739                        count(*) AS count
  740                    FROM {table_variants_from}
  741
  742                    UNION
  743
  744                    SELECT
  745                        'MNV' AS Type,
  746                        count(*) AS count
  747                    FROM {table_variants_from}
  748                    WHERE len(REF) > 1 AND len(ALT) > 1
  749                    AND len(REF) = len(ALT)
  750
  751                    UNION
  752
  753                    SELECT
  754                        'InDel' AS Type,
  755                        count(*) AS count
  756                    FROM {table_variants_from}
  757                    WHERE len(REF) > 1 OR len(ALT) > 1
  758                    AND len(REF) != len(ALT)
  759                    
  760                    UNION
  761
  762                    SELECT
  763                        'SNV' AS Type,
  764                        count(*) AS count
  765                    FROM {table_variants_from}
  766                    WHERE len(REF) = 1 AND len(ALT) = 1
  767
  768                )
  769
  770            ORDER BY count DESC
  771
  772                """
  773        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  774
  775        sql_query_snv_substitution = f"""
  776                SELECT
  777                    concat(REF, '>', ALT) AS 'Substitution',
  778                    count(*) AS count
  779                FROM {table_variants_from}
  780                WHERE len(REF) = 1 AND len(ALT) = 1
  781                GROUP BY REF, ALT
  782                ORDER BY count(*) DESC
  783                """
  784        snv_substitution = (
  785            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  786        )
  787        stats["Variants"]["Counts"] = snv_indel
  788        stats["Variants"]["Substitutions"] = snv_substitution
  789
  790        return stats
  791
  792    def stats_to_file(self, file: str = None) -> str:
  793        """
  794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  795        into a JSON object, and writes the JSON object to the specified file.
  796
  797        :param file: The `file` parameter is a string that represents the file path where the JSON data
  798        will be written
  799        :type file: str
  800        :return: the name of the file that was written to.
  801        """
  802
  803        # Get stats
  804        stats = self.get_stats()
  805
  806        # Serializing json
  807        json_object = json.dumps(stats, indent=4)
  808
  809        # Writing to sample.json
  810        with open(file, "w") as outfile:
  811            outfile.write(json_object)
  812
  813        return file
  814
  815    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  816        """
  817        The `print_stats` function generates a markdown file and prints the statistics contained in a
  818        JSON file in a formatted manner.
  819
  820        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  821        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  822        provided, a temporary directory will be created and the stats will be saved in a file named
  823        "stats.md" within that
  824        :type output_file: str
  825        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  826        file where the statistics will be saved. If no value is provided, a temporary directory will be
  827        created and a default file name "stats.json" will be used
  828        :type json_file: str
  829        :return: The function `print_stats` does not return any value. It has a return type annotation
  830        of `None`.
  831        """
  832
  833        # Full path
  834        output_file = full_path(output_file)
  835        json_file = full_path(json_file)
  836
  837        with tempfile.TemporaryDirectory() as tmpdir:
  838
  839            # Files
  840            if not output_file:
  841                output_file = os.path.join(tmpdir, "stats.md")
  842            if not json_file:
  843                json_file = os.path.join(tmpdir, "stats.json")
  844
  845            # Create folders
  846            if not os.path.exists(os.path.dirname(output_file)):
  847                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  848            if not os.path.exists(os.path.dirname(json_file)):
  849                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  850
  851            # Create stats JSON file
  852            stats_file = self.stats_to_file(file=json_file)
  853
  854            # Print stats file
  855            with open(stats_file) as f:
  856                stats = yaml.safe_load(f)
  857
  858            # Output
  859            output_title = []
  860            output_index = []
  861            output = []
  862
  863            # Title
  864            output_title.append("# HOWARD Stats")
  865
  866            # Index
  867            output_index.append("## Index")
  868
  869            # Process sections
  870            for section in stats:
  871                infos = stats.get(section)
  872                section_link = "#" + section.lower().replace(" ", "-")
  873                output.append(f"## {section}")
  874                output_index.append(f"- [{section}]({section_link})")
  875
  876                if len(infos):
  877                    for info in infos:
  878                        try:
  879                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  880                            is_df = True
  881                        except:
  882                            try:
  883                                df = pd.DataFrame.from_dict(
  884                                    json.loads((infos.get(info))), orient="index"
  885                                )
  886                                is_df = True
  887                            except:
  888                                is_df = False
  889                        if is_df:
  890                            output.append(f"### {info}")
  891                            info_link = "#" + info.lower().replace(" ", "-")
  892                            output_index.append(f"   - [{info}]({info_link})")
  893                            output.append(f"{df.to_markdown(index=False)}")
  894                        else:
  895                            output.append(f"- {info}: {infos.get(info)}")
  896                else:
  897                    output.append(f"NA")
  898
  899            # Write stats in markdown file
  900            with open(output_file, "w") as fp:
  901                for item in output_title:
  902                    fp.write("%s\n" % item)
  903                for item in output_index:
  904                    fp.write("%s\n" % item)
  905                for item in output:
  906                    fp.write("%s\n" % item)
  907
  908            # Output stats in markdown
  909            print("")
  910            print("\n\n".join(output_title))
  911            print("")
  912            print("\n\n".join(output))
  913            print("")
  914
  915        return None
  916
  917    def get_input(self) -> str:
  918        """
  919        It returns the value of the input variable.
  920        :return: The input is being returned.
  921        """
  922        return self.input
  923
  924    def get_input_format(self, input_file: str = None) -> str:
  925        """
  926        This function returns the format of the input variable, either from the provided input file or
  927        by prompting for input.
  928
  929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  930        represents the file path of the input file. If no `input_file` is provided when calling the
  931        method, it will default to `None`
  932        :type input_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not input_file:
  937            input_file = self.get_input()
  938        input_format = get_file_format(input_file)
  939        return input_format
  940
  941    def get_input_compressed(self, input_file: str = None) -> str:
  942        """
  943        The function `get_input_compressed` returns the format of the input variable after compressing
  944        it.
  945
  946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  947        that represents the file path of the input file. If no `input_file` is provided when calling the
  948        method, it will default to `None` and the method will then call `self.get_input()` to
  949        :type input_file: str
  950        :return: The function `get_input_compressed` returns the compressed format of the input
  951        variable.
  952        """
  953
  954        if not input_file:
  955            input_file = self.get_input()
  956        input_compressed = get_file_compressed(input_file)
  957        return input_compressed
  958
  959    def get_output(self) -> str:
  960        """
  961        It returns the output of the neuron.
  962        :return: The output of the neural network.
  963        """
  964
  965        return self.output
  966
  967    def get_output_format(self, output_file: str = None) -> str:
  968        """
  969        The function `get_output_format` returns the format of the input variable or the output file if
  970        provided.
  971
  972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  973        that represents the file path of the output file. If no `output_file` is provided when calling
  974        the method, it will default to the output obtained from the `get_output` method of the class
  975        instance. The
  976        :type output_file: str
  977        :return: The format of the input variable is being returned.
  978        """
  979
  980        if not output_file:
  981            output_file = self.get_output()
  982        output_format = get_file_format(output_file)
  983
  984        return output_format
  985
  986    def get_config(self) -> dict:
  987        """
  988        It returns the config
  989        :return: The config variable is being returned.
  990        """
  991        return self.config
  992
  993    def get_param(self) -> dict:
  994        """
  995        It returns the param
  996        :return: The param variable is being returned.
  997        """
  998        return self.param
  999
 1000    def get_connexion_db(self) -> str:
 1001        """
 1002        It returns the connexion_db attribute of the object
 1003        :return: The connexion_db is being returned.
 1004        """
 1005        return self.connexion_db
 1006
 1007    def get_prefix(self) -> str:
 1008        """
 1009        It returns the prefix of the object.
 1010        :return: The prefix is being returned.
 1011        """
 1012        return self.prefix
 1013
 1014    def get_table_variants(self, clause: str = "select") -> str:
 1015        """
 1016        This function returns the table_variants attribute of the object
 1017
 1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1019        defaults to select (optional)
 1020        :return: The table_variants attribute of the object.
 1021        """
 1022
 1023        # Access
 1024        access = self.get_config().get("access", None)
 1025
 1026        # Clauses "select", "where", "update"
 1027        if clause in ["select", "where", "update"]:
 1028            table_variants = self.table_variants
 1029        # Clause "from"
 1030        elif clause in ["from"]:
 1031            # For Read Only
 1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1033                input_file = self.get_input()
 1034                table_variants = f"'{input_file}' as variants"
 1035            # For Read Write
 1036            else:
 1037                table_variants = f"{self.table_variants} as variants"
 1038        else:
 1039            table_variants = self.table_variants
 1040        return table_variants
 1041
 1042    def get_tmp_dir(self) -> str:
 1043        """
 1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1045        parameters or a default path.
 1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1047        configuration, parameters, and a default value of "/tmp".
 1048        """
 1049
 1050        return get_tmp(
 1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1052        )
 1053
 1054    def get_connexion_type(self) -> str:
 1055        """
 1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1057
 1058        :return: The connexion type is being returned.
 1059        """
 1060        return self.get_config().get("connexion_type", "memory")
 1061
 1062    def get_connexion(self):
 1063        """
 1064        It returns the connection object
 1065
 1066        :return: The connection object.
 1067        """
 1068        return self.conn
 1069
 1070    def close_connexion(self) -> None:
 1071        """
 1072        This function closes the connection to the database.
 1073        :return: The connection is being closed.
 1074        """
 1075        return self.conn.close()
 1076
 1077    def get_header(self, type: str = "vcf"):
 1078        """
 1079        This function returns the header of the VCF file as a list of strings
 1080
 1081        :param type: the type of header you want to get, defaults to vcf (optional)
 1082        :return: The header of the vcf file.
 1083        """
 1084
 1085        if self.header_vcf:
 1086            if type == "vcf":
 1087                return self.header_vcf
 1088            elif type == "list":
 1089                return self.header_list
 1090        else:
 1091            if type == "vcf":
 1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1093                return header
 1094            elif type == "list":
 1095                return vcf_required
 1096
 1097    def get_header_infos_list(self) -> list:
 1098        """
 1099        This function retrieves a list of information fields from the header.
 1100        :return: A list of information fields from the header.
 1101        """
 1102
 1103        # Init
 1104        infos_list = []
 1105
 1106        for field in self.get_header().infos:
 1107            infos_list.append(field)
 1108
 1109        return infos_list
 1110
 1111    def get_header_length(self, file: str = None) -> int:
 1112        """
 1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1114        line.
 1115
 1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1117        header file. If this argument is provided, the function will read the header from the specified
 1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1119        :type file: str
 1120        :return: the length of the header list, excluding the #CHROM line.
 1121        """
 1122
 1123        if file:
 1124            return len(self.read_vcf_header_file(file=file)) - 1
 1125        elif self.get_header(type="list"):
 1126            return len(self.get_header(type="list")) - 1
 1127        else:
 1128            return 0
 1129
 1130    def get_header_columns(self) -> str:
 1131        """
 1132        This function returns the header list of a VCF
 1133
 1134        :return: The length of the header list.
 1135        """
 1136        if self.get_header():
 1137            return self.get_header(type="list")[-1]
 1138        else:
 1139            return ""
 1140
 1141    def get_header_columns_as_list(self) -> list:
 1142        """
 1143        This function returns the header list of a VCF
 1144
 1145        :return: The length of the header list.
 1146        """
 1147        if self.get_header():
 1148            return self.get_header_columns().strip().split("\t")
 1149        else:
 1150            return []
 1151
 1152    def get_header_columns_as_sql(self) -> str:
 1153        """
 1154        This function retruns header length (without #CHROM line)
 1155
 1156        :return: The length of the header list.
 1157        """
 1158        sql_column_list = []
 1159        for col in self.get_header_columns_as_list():
 1160            sql_column_list.append(f'"{col}"')
 1161        return ",".join(sql_column_list)
 1162
 1163    def get_header_sample_list(
 1164        self, check: bool = False, samples: list = None, samples_force: bool = False
 1165    ) -> list:
 1166        """
 1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1168        checking and filtering based on input parameters.
 1169
 1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1171        parameter that determines whether to check if the samples in the list are properly defined as
 1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1173        list is defined as a, defaults to False
 1174        :type check: bool (optional)
 1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1176        allows you to specify a subset of samples from the header. If you provide a list of sample
 1177        names, the function will check if each sample is defined in the header. If a sample is not found
 1178        in the
 1179        :type samples: list
 1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1181        a boolean parameter that determines whether to force the function to return the sample list
 1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1183        function will return the sample list without performing, defaults to False
 1184        :type samples_force: bool (optional)
 1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1186        parameters and conditions specified in the function.
 1187        """
 1188
 1189        # Init
 1190        samples_list = []
 1191
 1192        if samples is None:
 1193            samples_list = self.header_vcf.samples
 1194        else:
 1195            samples_checked = []
 1196            for sample in samples:
 1197                if sample in self.header_vcf.samples:
 1198                    samples_checked.append(sample)
 1199                else:
 1200                    log.warning(f"Sample '{sample}' not defined in header")
 1201            samples_list = samples_checked
 1202
 1203            # Force sample list without checking if is_genotype_column
 1204            if samples_force:
 1205                log.warning(f"Samples {samples_list} not checked if genotypes")
 1206                return samples_list
 1207
 1208        if check:
 1209            samples_checked = []
 1210            for sample in samples_list:
 1211                if self.is_genotype_column(column=sample):
 1212                    samples_checked.append(sample)
 1213                else:
 1214                    log.warning(
 1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1216                    )
 1217            samples_list = samples_checked
 1218
 1219        # Return samples list
 1220        return samples_list
 1221
 1222    def is_genotype_column(self, column: str = None) -> bool:
 1223        """
 1224        This function checks if a given column is a genotype column in a database.
 1225
 1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1227        represents the column name in a database table. This method checks if the specified column is a
 1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1229        method of
 1230        :type column: str
 1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1233        column name and returns the result. If the `column` parameter is None, it returns False.
 1234        """
 1235
 1236        if column is not None:
 1237            return Database(database=self.get_input()).is_genotype_column(column=column)
 1238        else:
 1239            return False
 1240
 1241    def get_verbose(self) -> bool:
 1242        """
 1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1244        exist
 1245
 1246        :return: The value of the key "verbose" in the config dictionary.
 1247        """
 1248        return self.get_config().get("verbose", False)
 1249
 1250    def get_connexion_format(self) -> str:
 1251        """
 1252        It returns the connexion format of the object.
 1253        :return: The connexion_format is being returned.
 1254        """
 1255        connexion_format = self.connexion_format
 1256        if connexion_format not in ["duckdb", "sqlite"]:
 1257            log.error(f"Unknown connexion format {connexion_format}")
 1258            raise ValueError(f"Unknown connexion format {connexion_format}")
 1259        else:
 1260            return connexion_format
 1261
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        'variants' table of the current database connexion.

        :param file: the file (path or file-like object) to load; passed
        directly to pandas.read_csv
        :param columns: comma-separated, quoted column names used in the
        INSERT statement (duckdb connexion only)
        :type columns: str
        :param header_len: number of leading lines to skip before the data
        (e.g. the VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
        the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load.chunk" configuration entry overrides the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # A falsy chunksize skips loading entirely
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # Relies on DuckDB's replacement scan: the local pandas
                    # DataFrame `chunk` is resolved by name inside the query
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas issues the INSERTs itself for sqlite connexions
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1315
 1316    def load_data(
 1317        self,
 1318        input_file: str = None,
 1319        drop_variants_table: bool = False,
 1320        sample_size: int = 20480,
 1321    ) -> None:
 1322        """
 1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1324        table before loading the data and specify a sample size.
 1325
 1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1327        table
 1328        :type input_file: str
 1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1330        determines whether the variants table should be dropped before loading the data. If set to
 1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1332        not be dropped, defaults to False
 1333        :type drop_variants_table: bool (optional)
 1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1336        20480
 1337        :type sample_size: int (optional)
 1338        """
 1339
 1340        log.info("Loading...")
 1341
 1342        # change input file
 1343        if input_file:
 1344            self.set_input(input_file)
 1345            self.set_header()
 1346
 1347        # drop variants table
 1348        if drop_variants_table:
 1349            self.drop_variants_table()
 1350
 1351        # get table variants
 1352        table_variants = self.get_table_variants()
 1353
 1354        # Access
 1355        access = self.get_config().get("access", None)
 1356        log.debug(f"access: {access}")
 1357
 1358        # Input format and compress
 1359        input_format = self.get_input_format()
 1360        input_compressed = self.get_input_compressed()
 1361        log.debug(f"input_format: {input_format}")
 1362        log.debug(f"input_compressed: {input_compressed}")
 1363
 1364        # input_compressed_format
 1365        if input_compressed:
 1366            input_compressed_format = "gzip"
 1367        else:
 1368            input_compressed_format = "none"
 1369        log.debug(f"input_compressed_format: {input_compressed_format}")
 1370
 1371        # Connexion format
 1372        connexion_format = self.get_connexion_format()
 1373
 1374        # Sample size
 1375        if not sample_size:
 1376            sample_size = -1
 1377        log.debug(f"sample_size: {sample_size}")
 1378
 1379        # Load data
 1380        log.debug(f"Load Data from {input_format}")
 1381
 1382        # DuckDB connexion
 1383        if connexion_format in ["duckdb"]:
 1384
 1385            # Database already exists
 1386            if self.input_format in ["db", "duckdb"]:
 1387
 1388                if connexion_format in ["duckdb"]:
 1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1390                else:
 1391                    log.error(
 1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1393                    )
 1394                    raise ValueError(
 1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1396                    )
 1397
 1398            # Load from existing database format
 1399            else:
 1400
 1401                try:
 1402                    # Create Table or View
 1403                    database = Database(database=self.input)
 1404                    sql_from = database.get_sql_from(sample_size=sample_size)
 1405
 1406                    if access in ["RO"]:
 1407                        sql_load = (
 1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1409                        )
 1410                    else:
 1411                        sql_load = (
 1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1413                        )
 1414                    self.conn.execute(sql_load)
 1415
 1416                except:
 1417                    # Format not available
 1418                    log.error(f"Input file format '{self.input_format}' not available")
 1419                    raise ValueError(
 1420                        f"Input file format '{self.input_format}' not available"
 1421                    )
 1422
 1423        # SQLite connexion
 1424        elif connexion_format in ["sqlite"] and input_format in [
 1425            "vcf",
 1426            "tsv",
 1427            "csv",
 1428            "psv",
 1429        ]:
 1430
 1431            # Main structure
 1432            structure = {
 1433                "#CHROM": "VARCHAR",
 1434                "POS": "INTEGER",
 1435                "ID": "VARCHAR",
 1436                "REF": "VARCHAR",
 1437                "ALT": "VARCHAR",
 1438                "QUAL": "VARCHAR",
 1439                "FILTER": "VARCHAR",
 1440                "INFO": "VARCHAR",
 1441            }
 1442
 1443            # Strcuture with samples
 1444            structure_complete = structure
 1445            if self.get_header_sample_list():
 1446                structure["FORMAT"] = "VARCHAR"
 1447                for sample in self.get_header_sample_list():
 1448                    structure_complete[sample] = "VARCHAR"
 1449
 1450            # Columns list for create and insert
 1451            sql_create_table_columns = []
 1452            sql_create_table_columns_list = []
 1453            for column in structure_complete:
 1454                column_type = structure_complete[column]
 1455                sql_create_table_columns.append(
 1456                    f'"{column}" {column_type} default NULL'
 1457                )
 1458                sql_create_table_columns_list.append(f'"{column}"')
 1459
 1460            # Create database
 1461            log.debug(f"Create Table {table_variants}")
 1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1465            self.conn.execute(sql_create_table)
 1466
 1467            # chunksize define length of file chunk load file
 1468            chunksize = 100000
 1469
 1470            # delimiter
 1471            delimiter = file_format_delimiters.get(input_format, "\t")
 1472
 1473            # Load the input file
 1474            with open(self.input, "rt") as input_file:
 1475
 1476                # Use the appropriate file handler based on the input format
 1477                if input_compressed:
 1478                    input_file = bgzf.open(self.input, "rt")
 1479                if input_format in ["vcf"]:
 1480                    header_len = self.get_header_length()
 1481                else:
 1482                    header_len = 0
 1483
 1484                # Insert the file contents into a table
 1485                self.insert_file_to_table(
 1486                    input_file,
 1487                    columns=sql_create_table_columns_list_sql,
 1488                    header_len=header_len,
 1489                    sep=delimiter,
 1490                    chunksize=chunksize,
 1491                )
 1492
 1493        else:
 1494            log.error(
 1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1496            )
 1497            raise ValueError(
 1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1499            )
 1500
 1501        # Explode INFOS fields into table fields
 1502        if self.get_explode_infos():
 1503            self.explode_infos(
 1504                prefix=self.get_explode_infos_prefix(),
 1505                fields=self.get_explode_infos_fields(),
 1506                force=True,
 1507            )
 1508
 1509        # Create index after insertion
 1510        self.create_indexes()
 1511
 1512    def get_explode_infos(self) -> bool:
 1513        """
 1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1515        to False if it is not set.
 1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1517        value. If the parameter is not present, it will return False.
 1518        """
 1519
 1520        return self.get_param().get("explode", {}).get("explode_infos", False)
 1521
 1522    def get_explode_infos_fields(
 1523        self,
 1524        explode_infos_fields: str = None,
 1525        remove_fields_not_in_header: bool = False,
 1526    ) -> list:
 1527        """
 1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1529        the input parameter `explode_infos_fields`.
 1530
 1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1533        comma-separated list of field names to explode
 1534        :type explode_infos_fields: str
 1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1536        flag that determines whether to remove fields that are not present in the header. If it is set
 1537        to `True`, any field that is not in the header will be excluded from the list of exploded
 1538        information fields. If it is set to `, defaults to False
 1539        :type remove_fields_not_in_header: bool (optional)
 1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1544        splitting the string by commas.
 1545        """
 1546
 1547        # If no fields, get it in param
 1548        if not explode_infos_fields:
 1549            explode_infos_fields = (
 1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1551            )
 1552
 1553        # If no fields, defined as all fields in header using keyword
 1554        if not explode_infos_fields:
 1555            explode_infos_fields = "*"
 1556
 1557        # If fields list not empty
 1558        if explode_infos_fields:
 1559
 1560            # Input fields list
 1561            if isinstance(explode_infos_fields, str):
 1562                fields_input = explode_infos_fields.split(",")
 1563            elif isinstance(explode_infos_fields, list):
 1564                fields_input = explode_infos_fields
 1565            else:
 1566                fields_input = []
 1567
 1568            # Fields list without * keyword
 1569            fields_without_all = fields_input.copy()
 1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1571                fields_without_all.remove("*")
 1572
 1573            # Fields in header
 1574            fields_in_header = sorted(list(set(self.get_header().infos)))
 1575
 1576            # Construct list of fields
 1577            fields_output = []
 1578            for field in fields_input:
 1579
 1580                # Strip field
 1581                field = field.strip()
 1582
 1583                # format keyword * in regex
 1584                if field.upper() in ["*"]:
 1585                    field = ".*"
 1586
 1587                # Find all fields with pattern
 1588                r = re.compile(rf"^{field}$")
 1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1590
 1591                # Remove fields input from search
 1592                if field in fields_search:
 1593                    fields_search = [field]
 1594                elif fields_search != [field]:
 1595                    fields_search = sorted(
 1596                        list(set(fields_search).difference(fields_input))
 1597                    )
 1598
 1599                # If field is not in header (avoid not well formatted header)
 1600                if not fields_search and not remove_fields_not_in_header:
 1601                    fields_search = [field]
 1602
 1603                # Add found fields
 1604                for new_field in fields_search:
 1605                    # Add field, if not already exists, and if it is in header (if asked)
 1606                    if (
 1607                        new_field not in fields_output
 1608                        and (
 1609                            not remove_fields_not_in_header
 1610                            or new_field in fields_in_header
 1611                        )
 1612                        and new_field not in [".*"]
 1613                    ):
 1614                        fields_output.append(new_field)
 1615
 1616            return fields_output
 1617
 1618        else:
 1619
 1620            return []
 1621
 1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1623        """
 1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1626        not provided.
 1627
 1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1629        prefix to be used for exploding or expanding information
 1630        :type explode_infos_prefix: str
 1631        :return: the value of the variable `explode_infos_prefix`.
 1632        """
 1633
 1634        if not explode_infos_prefix:
 1635            explode_infos_prefix = (
 1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1637            )
 1638
 1639        return explode_infos_prefix
 1640
 1641    def add_column(
 1642        self,
 1643        table_name,
 1644        column_name,
 1645        column_type,
 1646        default_value=None,
 1647        drop: bool = False,
 1648    ) -> dict:
 1649        """
 1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1651        doesn't already exist.
 1652
 1653        :param table_name: The name of the table to which you want to add a column
 1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1655        to the table
 1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1657        want to add to the table. It should be a string that represents the desired data type, such as
 1658        "INTEGER", "TEXT", "REAL", etc
 1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1660        default value for the newly added column. If a default value is provided, it will be assigned to
 1661        the column for any existing rows that do not have a value for that column
 1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1665        to False
 1666        :type drop: bool (optional)
 1667        :return: a boolean value indicating whether the column was successfully added to the table.
 1668        """
 1669
 1670        # added
 1671        added = False
 1672        dropped = False
 1673
 1674        # Check if the column already exists in the table
 1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1676        columns = self.get_query_to_df(query).columns.tolist()
 1677        if column_name.upper() in [c.upper() for c in columns]:
 1678            log.debug(
 1679                f"The {column_name} column already exists in the {table_name} table"
 1680            )
 1681            if drop:
 1682                self.drop_column(table_name=table_name, column_name=column_name)
 1683                dropped = True
 1684            else:
 1685                return None
 1686        else:
 1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1688
 1689        # Add column in table
 1690        add_column_query = (
 1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1692        )
 1693        if default_value is not None:
 1694            add_column_query += f" DEFAULT {default_value}"
 1695        self.execute_query(add_column_query)
 1696        added = not dropped
 1697        log.debug(
 1698            f"The {column_name} column was successfully added to the {table_name} table"
 1699        )
 1700
 1701        if added:
 1702            added_column = {
 1703                "table_name": table_name,
 1704                "column_name": column_name,
 1705                "column_type": column_type,
 1706                "default_value": default_value,
 1707            }
 1708        else:
 1709            added_column = None
 1710
 1711        return added_column
 1712
 1713    def drop_column(
 1714        self, column: dict = None, table_name: str = None, column_name: str = None
 1715    ) -> bool:
 1716        """
 1717        The `drop_column` function drops a specified column from a given table in a database and returns
 1718        True if the column was successfully dropped, and False if the column does not exist in the
 1719        table.
 1720
 1721        :param column: The `column` parameter is a dictionary that contains information about the column
 1722        you want to drop. It has two keys:
 1723        :type column: dict
 1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1725        drop a column
 1726        :type table_name: str
 1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1728        from the table
 1729        :type column_name: str
 1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1731        and False if the column does not exist in the table.
 1732        """
 1733
 1734        # Find column infos
 1735        if column:
 1736            if isinstance(column, dict):
 1737                table_name = column.get("table_name", None)
 1738                column_name = column.get("column_name", None)
 1739            elif isinstance(column, str):
 1740                table_name = self.get_table_variants()
 1741                column_name = column
 1742            else:
 1743                table_name = None
 1744                column_name = None
 1745
 1746        if not table_name and not column_name:
 1747            return False
 1748
 1749        # Removed
 1750        removed = False
 1751
 1752        # Check if the column already exists in the table
 1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1754        columns = self.get_query_to_df(query).columns.tolist()
 1755        if column_name in columns:
 1756            log.debug(f"The {column_name} column exists in the {table_name} table")
 1757        else:
 1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1759            return False
 1760
 1761        # Add column in table # ALTER TABLE integers DROP k
 1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1763        self.execute_query(add_column_query)
 1764        removed = True
 1765        log.debug(
 1766            f"The {column_name} column was successfully dropped to the {table_name} table"
 1767        )
 1768
 1769        return removed
 1770
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns.

        For each requested INFO field, a dedicated column is added to the
        variants table (or to `table` when given) and populated by parsing
        the raw INFO string with SQL expressions specific to the connexion
        format (DuckDB regex extraction, or SQLite instr/substr slicing).
        Updates are run chromosome by chromosome. Nothing is done when the
        database access mode is read-only ("RO"). Indexes are dropped first
        and optionally re-created at the end.

        :param prefix: prefix for the exploded column names; when None/True
        or not a string, falls back to `get_explode_infos_prefix()` and then
        to "INFO/"
        :type prefix: str
        :param create_index: whether to (re)create indexes after exploding,
        defaults to False
        :type create_index: bool (optional)
        :param fields: list of INFO fields (or patterns) to explode; resolved
        through `get_explode_infos_fields()`, so an empty value means all
        header fields
        :type fields: list
        :param force: drop and re-create a column if it already exists, and
        re-run the population query for it, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: update all exploded columns in a
        single UPDATE statement instead of one UPDATE per field, defaults to
        False
        :type proccess_all_fields_together: bool (optional)
        :param table: name of the table to alter; defaults to the variants
        table
        :type table: str
        :return: list of added columns (dicts as returned by `add_column`)
        """

        # drop indexes (they would slow down / block the ALTER and UPDATE below)
        self.drop_indexes()

        # connexion format (drives which SQL dialect is generated)
        connexion_format = self.get_connexion_format()

        # Access mode; "RO" disables any modification
        access = self.get_config().get("access", None)

        # Added columns (returned to the caller)
        added_columns = []

        if access not in ["RO"]:

            # prefix: normalize to a usable string, defaulting to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants (explicit table wins over the default)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except — any failure of get_extra_infos()
            # silently falls back to an empty list
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos (field name -> header record with .type/.num)
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # Per-field SET clauses accumulated for the UPDATE step
            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields, normalize to an empty list
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regex entries)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column
                info_id_sql = prefix + info

                # Only explode fields known from the header, the request, or extras
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/arity from the header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR regardless of type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (returns None if the column pre-existed)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # Populate new columns; with force=True also re-populate
                    # dropped/re-added ones (add_column returns None for those)
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: build the engine-specific SET clause
                        # extracting "info=value" from the raw INFO string
                        # NOTE(review): update_info_field is left unbound when
                        # connexion_format is neither duckdb nor sqlite —
                        # presumably unreachable here; confirm upstream validation
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes: split the UPDATE workload per chromosome
                # NOTE(review): bare except — any query failure falls back to a
                # single whole-table pass ([None])
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause: only filter when there is more than one chromosome
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: one big UPDATE, or one UPDATE per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes (only when explicitly requested)
        if create_index:
            self.create_indexes()

        return added_columns
 1987
 1988    def create_indexes(self) -> None:
 1989        """
 1990        Create indexes on the table after insertion
 1991        """
 1992
 1993        # Access
 1994        access = self.get_config().get("access", None)
 1995
 1996        # get table variants
 1997        table_variants = self.get_table_variants("FROM")
 1998
 1999        if self.get_indexing() and access not in ["RO"]:
 2000            # Create index
 2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2002            self.conn.execute(sql_create_table_index)
 2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2004            self.conn.execute(sql_create_table_index)
 2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2006            self.conn.execute(sql_create_table_index)
 2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2008            self.conn.execute(sql_create_table_index)
 2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2010            self.conn.execute(sql_create_table_index)
 2011            for field in self.index_additionnal_fields:
 2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2013                self.conn.execute(sql_create_table_index)
 2014
 2015    def drop_indexes(self) -> None:
 2016        """
 2017        Create indexes on the table after insertion
 2018        """
 2019
 2020        # Access
 2021        access = self.get_config().get("access", None)
 2022
 2023        # get table variants
 2024        table_variants = self.get_table_variants("FROM")
 2025
 2026        # Get database format
 2027        connexion_format = self.get_connexion_format()
 2028
 2029        if access not in ["RO"]:
 2030            if connexion_format in ["duckdb"]:
 2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2032            elif connexion_format in ["sqlite"]:
 2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2034
 2035            list_indexes = self.conn.execute(sql_list_indexes)
 2036            index_names = [row[0] for row in list_indexes.fetchall()]
 2037            for index in index_names:
 2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2039                self.conn.execute(sql_drop_table_index)
 2040
 2041    def read_vcf_header(self, f) -> list:
 2042        """
 2043        It reads the header of a VCF file and returns a list of the header lines
 2044
 2045        :param f: the file object
 2046        :return: The header lines of the VCF file.
 2047        """
 2048
 2049        header_list = []
 2050        for line in f:
 2051            header_list.append(line)
 2052            if line.startswith("#CHROM"):
 2053                break
 2054        return header_list
 2055
 2056    def read_vcf_header_file(self, file: str = None) -> list:
 2057        """
 2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2059        uncompressed files.
 2060
 2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2063        default to `None`
 2064        :type file: str
 2065        :return: The function `read_vcf_header_file` returns a list.
 2066        """
 2067
 2068        if self.get_input_compressed(input_file=file):
 2069            with bgzf.open(file, "rt") as f:
 2070                return self.read_vcf_header(f=f)
 2071        else:
 2072            with open(file, "rt") as f:
 2073                return self.read_vcf_header(f=f)
 2074
 2075    def execute_query(self, query: str):
 2076        """
 2077        It takes a query as an argument, executes it, and returns the results
 2078
 2079        :param query: The query to be executed
 2080        :return: The result of the query is being returned.
 2081        """
 2082        if query:
 2083            return self.conn.execute(query)  # .fetchall()
 2084        else:
 2085            return None
 2086
 2087    def export_output(
 2088        self,
 2089        output_file: str | None = None,
 2090        output_header: str | None = None,
 2091        export_header: bool = True,
 2092        query: str | None = None,
 2093        parquet_partitions: list | None = None,
 2094        chunk_size: int | None = None,
 2095        threads: int | None = None,
 2096        sort: bool = False,
 2097        index: bool = False,
 2098        order_by: str | None = None,
 2099        fields_to_rename: dict | None = None,
 2100    ) -> bool:
 2101        """
 2102        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2103        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2104        partitioning.
 2105
 2106        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2107        output file where the exported data will be saved
 2108        :type output_file: str | None
 2109        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2110        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2111        header will be exported to a file with the same name as the `output_file` parameter, but with
 2112        the extension "
 2113        :type output_header: str | None
 2114        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2115        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2116        True, the header will be exported to a file. If `export_header` is False, the header will not
 2117        be, defaults to True
 2118        :type export_header: bool (optional)
 2119        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2120        that can be used to filter and select specific data from the VCF file before exporting it. If
 2121        provided, only the data that matches the query will be exported. This allows you to customize
 2122        the exported data based on
 2123        :type query: str | None
 2124        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2125        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2126        organize data in a hierarchical directory structure based on the values of one or more columns.
 2127        This can improve query performance when working with large datasets
 2128        :type parquet_partitions: list | None
 2129        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2130        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2131        multiple files. It helps in optimizing the export process by breaking down the data into
 2132        manageable chunks for processing and storage
 2133        :type chunk_size: int | None
 2134        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2135        threads to be used during the export process. It determines the level of parallelism and can
 2136        improve the performance of the export operation. If this parameter is not provided, the function
 2137        will use the default number of threads
 2138        :type threads: int | None
 2139        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2140        determines whether the output file should be sorted based on genomic coordinates of the
 2141        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2142        `False`,, defaults to False
 2143        :type sort: bool (optional)
 2144        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2145        determines whether an index should be created on the output file. If `index` is set to `True`,
 2146        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2147        :type index: bool (optional)
 2148        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2149        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2150        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2151        output file should be
 2152        :type order_by: str | None
 2153        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2154        mapping of field names to be renamed during the export process. This parameter allows you to
 2155        customize the output field names before exporting the data. Each key-value pair in the
 2156        dictionary represents the original field name as the key and the new field name
 2157        :type fields_to_rename: dict | None
 2158        :return: The `export_output` function returns a boolean value. It checks if the output file
 2159        exists and returns True if it does, or None if it doesn't.
 2160        """
 2161
 2162        # Log
 2163        log.info("Exporting...")
 2164
 2165        # Full path
 2166        output_file = full_path(output_file)
 2167        output_header = full_path(output_header)
 2168
 2169        # Config
 2170        config = self.get_config()
 2171
 2172        # Param
 2173        param = self.get_param()
 2174
 2175        # Tmp files to remove
 2176        tmp_to_remove = []
 2177
 2178        # If no output, get it
 2179        if not output_file:
 2180            output_file = self.get_output()
 2181
 2182        # If not threads
 2183        if not threads:
 2184            threads = self.get_threads()
 2185
 2186        # Rename fields
 2187        if not fields_to_rename:
 2188            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2189        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2190
 2191        # Auto header name with extension
 2192        if export_header or output_header:
 2193            if not output_header:
 2194                output_header = f"{output_file}.hdr"
 2195            # Export header
 2196            self.export_header(output_file=output_file)
 2197
 2198        # Switch off export header if VCF output
 2199        output_file_type = get_file_format(output_file)
 2200        if output_file_type in ["vcf"]:
 2201            export_header = False
 2202            tmp_to_remove.append(output_header)
 2203
 2204        # Chunk size
 2205        if not chunk_size:
 2206            chunk_size = config.get("chunk_size", None)
 2207
 2208        # Parquet partition
 2209        if not parquet_partitions:
 2210            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2211        if parquet_partitions and isinstance(parquet_partitions, str):
 2212            parquet_partitions = parquet_partitions.split(",")
 2213
 2214        # Order by
 2215        if not order_by:
 2216            order_by = param.get("export", {}).get("order_by", "")
 2217
 2218        # Header in output
 2219        header_in_output = param.get("export", {}).get("include_header", False)
 2220
 2221        # Database
 2222        database_source = self.get_connexion()
 2223
 2224        # Connexion format
 2225        connexion_format = self.get_connexion_format()
 2226
 2227        # Explode infos
 2228        if self.get_explode_infos():
 2229            self.explode_infos(
 2230                prefix=self.get_explode_infos_prefix(),
 2231                fields=self.get_explode_infos_fields(),
 2232                force=False,
 2233            )
 2234
 2235        # if connexion_format in ["sqlite"] or query:
 2236        if connexion_format in ["sqlite"]:
 2237
 2238            # Export in Parquet
 2239            random_tmp = "".join(
 2240                random.choice(string.ascii_lowercase) for i in range(10)
 2241            )
 2242            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2243            tmp_to_remove.append(database_source)
 2244
 2245            # Table Variants
 2246            table_variants = self.get_table_variants()
 2247
 2248            # Create export query
 2249            sql_query_export_subquery = f"""
 2250                SELECT * FROM {table_variants}
 2251                """
 2252
 2253            # Write source file
 2254            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2255
 2256        # Create database
 2257        database = Database(
 2258            database=database_source,
 2259            table="variants",
 2260            header_file=output_header,
 2261            conn_config=self.get_connexion_config(),
 2262        )
 2263
 2264        # Existing colomns header
 2265        existing_columns_header = database.get_header_columns_from_database(query=query)
 2266
 2267        # Sample list
 2268        if output_file_type in ["vcf"]:
 2269            get_samples = self.get_samples()
 2270            get_samples_check = self.get_samples_check()
 2271            samples_force = get_samples is not None
 2272            sample_list = self.get_header_sample_list(
 2273                check=get_samples_check,
 2274                samples=get_samples,
 2275                samples_force=samples_force,
 2276            )
 2277        else:
 2278            sample_list = None
 2279
 2280        # Export file
 2281        database.export(
 2282            output_database=output_file,
 2283            output_header=output_header,
 2284            existing_columns_header=existing_columns_header,
 2285            parquet_partitions=parquet_partitions,
 2286            chunk_size=chunk_size,
 2287            threads=threads,
 2288            sort=sort,
 2289            index=index,
 2290            header_in_output=header_in_output,
 2291            order_by=order_by,
 2292            query=query,
 2293            export_header=export_header,
 2294            sample_list=sample_list,
 2295        )
 2296
 2297        # Remove
 2298        remove_if_exists(tmp_to_remove)
 2299
 2300        return (os.path.exists(output_file) or None) and (
 2301            os.path.exists(output_file) or None
 2302        )
 2303
 2304    def get_extra_infos(self, table: str = None) -> list:
 2305        """
 2306        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2307        in the header.
 2308
 2309        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2310        name of the table from which you want to retrieve the extra columns that are not present in the
 2311        header. If the `table` parameter is not provided when calling the function, it will default to
 2312        using the variants
 2313        :type table: str
 2314        :return: A list of columns that are in the specified table but not in the header of the table.
 2315        """
 2316
 2317        header_columns = []
 2318
 2319        if not table:
 2320            table = self.get_table_variants(clause="from")
 2321            header_columns = self.get_header_columns()
 2322
 2323        # Check all columns in the database
 2324        query = f""" SELECT * FROM {table} LIMIT 1 """
 2325        log.debug(f"query {query}")
 2326        table_columns = self.get_query_to_df(query).columns.tolist()
 2327        extra_columns = []
 2328
 2329        # Construct extra infos (not in header)
 2330        for column in table_columns:
 2331            if column not in header_columns:
 2332                extra_columns.append(column)
 2333
 2334        return extra_columns
 2335
 2336    def get_extra_infos_sql(self, table: str = None) -> str:
 2337        """
 2338        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2339        by double quotes
 2340
 2341        :param table: The name of the table to get the extra infos from. If None, the default table is
 2342        used
 2343        :type table: str
 2344        :return: A string of the extra infos
 2345        """
 2346
 2347        return ", ".join(
 2348            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2349        )
 2350
 2351    def export_header(
 2352        self,
 2353        header_name: str = None,
 2354        output_file: str = None,
 2355        output_file_ext: str = ".hdr",
 2356        clean_header: bool = True,
 2357        remove_chrom_line: bool = False,
 2358    ) -> str:
 2359        """
 2360        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2361        specified options, and writes it to a new file.
 2362
 2363        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2364        this parameter is not specified, the header will be written to the output file
 2365        :type header_name: str
 2366        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2367        specify the name of the output file where the header will be written. If this parameter is not
 2368        provided, the header will be written to a temporary file
 2369        :type output_file: str
 2370        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2371        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2372        if not specified by the user. This extension will be appended to the `output_file` name to
 2373        create the final, defaults to .hdr
 2374        :type output_file_ext: str (optional)
 2375        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2376        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2377        `True`, the function will clean the header by modifying certain lines based on a specific
 2378        pattern. If `clean_header`, defaults to True
 2379        :type clean_header: bool (optional)
 2380        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2381        boolean flag that determines whether the #CHROM line should be removed from the header before
 2382        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2383        defaults to False
 2384        :type remove_chrom_line: bool (optional)
 2385        :return: The function `export_header` returns the name of the temporary header file that is
 2386        created.
 2387        """
 2388
 2389        if not header_name and not output_file:
 2390            output_file = self.get_output()
 2391
 2392        if self.get_header():
 2393
 2394            # Get header object
 2395            header_obj = self.get_header()
 2396
 2397            # Create database
 2398            db_for_header = Database(database=self.get_input())
 2399
 2400            # Get real columns in the file
 2401            db_header_columns = db_for_header.get_columns()
 2402
 2403            with tempfile.TemporaryDirectory() as tmpdir:
 2404
 2405                # Write header file
 2406                header_file_tmp = os.path.join(tmpdir, "header")
 2407                f = open(header_file_tmp, "w")
 2408                vcf.Writer(f, header_obj)
 2409                f.close()
 2410
 2411                # Replace #CHROM line with rel columns
 2412                header_list = db_for_header.read_header_file(
 2413                    header_file=header_file_tmp
 2414                )
 2415                header_list[-1] = "\t".join(db_header_columns)
 2416
 2417                # Remove CHROM line
 2418                if remove_chrom_line:
 2419                    header_list.pop()
 2420
 2421                # Clean header
 2422                if clean_header:
 2423                    header_list_clean = []
 2424                    for head in header_list:
 2425                        # Clean head for malformed header
 2426                        head_clean = head
 2427                        head_clean = re.subn(
 2428                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2429                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2430                            head_clean,
 2431                            2,
 2432                        )[0]
 2433                        # Write header
 2434                        header_list_clean.append(head_clean)
 2435                    header_list = header_list_clean
 2436
 2437            tmp_header_name = output_file + output_file_ext
 2438
 2439            f = open(tmp_header_name, "w")
 2440            for line in header_list:
 2441                f.write(line)
 2442            f.close()
 2443
 2444        return tmp_header_name
 2445
 2446    def export_variant_vcf(
 2447        self,
 2448        vcf_file,
 2449        remove_info: bool = False,
 2450        add_samples: bool = True,
 2451        list_samples: list = [],
 2452        where_clause: str = "",
 2453        index: bool = False,
 2454        threads: int | None = None,
 2455    ) -> bool | None:
 2456        """
 2457        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2458        remove INFO field, add samples, and control compression and indexing.
 2459
 2460        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2461        written to. It is the output file that will contain the filtered VCF data based on the specified
 2462        parameters
 2463        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2464        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2465        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2466        in, defaults to False
 2467        :type remove_info: bool (optional)
 2468        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2469        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2470        If set to False, the samples will be removed. The default value is True, defaults to True
 2471        :type add_samples: bool (optional)
 2472        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2473        in the output VCF file. By default, all samples will be included. If you provide a list of
 2474        samples, only those samples will be included in the output file
 2475        :type list_samples: list
 2476        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2477        determines whether or not to create an index for the output VCF file. If `index` is set to
 2478        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2479        :type index: bool (optional)
 2480        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2481        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2482        will be used during the export process. More threads can potentially speed up the export process
 2483        by utilizing multiple cores of the processor. If
 2484        :type threads: int | None
 2485        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2486        method with various parameters including the output file, query, threads, sort flag, and index
 2487        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2488        specified parameters and configurations provided in the `export_variant_vcf` function.
 2489        """
 2490
 2491        # Config
 2492        config = self.get_config()
 2493
 2494        # Extract VCF
 2495        log.debug("Export VCF...")
 2496
 2497        # Table variants
 2498        table_variants = self.get_table_variants()
 2499
 2500        # Threads
 2501        if not threads:
 2502            threads = self.get_threads()
 2503
 2504        # Info fields
 2505        if remove_info:
 2506            if not isinstance(remove_info, str):
 2507                remove_info = "."
 2508            info_field = f"""'{remove_info}' as INFO"""
 2509        else:
 2510            info_field = "INFO"
 2511
 2512        # Samples fields
 2513        if add_samples:
 2514            if not list_samples:
 2515                list_samples = self.get_header_sample_list()
 2516            if list_samples:
 2517                samples_fields = " , FORMAT , " + " , ".join(
 2518                    [f""" "{sample}" """ for sample in list_samples]
 2519                )
 2520            else:
 2521                samples_fields = ""
 2522            log.debug(f"samples_fields: {samples_fields}")
 2523        else:
 2524            samples_fields = ""
 2525
 2526        # Where clause
 2527        if where_clause is None:
 2528            where_clause = ""
 2529
 2530        # Variants
 2531        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2532        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2533        log.debug(f"sql_query_select={sql_query_select}")
 2534
 2535        return self.export_output(
 2536            output_file=vcf_file,
 2537            output_header=None,
 2538            export_header=True,
 2539            query=sql_query_select,
 2540            parquet_partitions=None,
 2541            chunk_size=config.get("chunk_size", None),
 2542            threads=threads,
 2543            sort=True,
 2544            index=index,
 2545            order_by=None,
 2546        )
 2547
 2548    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2549        """
 2550        It takes a list of commands and runs them in parallel using the number of threads specified
 2551
 2552        :param commands: A list of commands to run
 2553        :param threads: The number of threads to use, defaults to 1 (optional)
 2554        """
 2555
 2556        run_parallel_commands(commands, threads)
 2557
 2558    def get_threads(self, default: int = 1) -> int:
 2559        """
 2560        This function returns the number of threads to use for a job, with a default value of 1 if not
 2561        specified.
 2562
 2563        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2564        default number of threads to use if no specific value is provided. If no value is provided for
 2565        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2566        used, defaults to 1
 2567        :type default: int (optional)
 2568        :return: the number of threads to use for the current job.
 2569        """
 2570
 2571        # Config
 2572        config = self.get_config()
 2573
 2574        # Param
 2575        param = self.get_param()
 2576
 2577        # Input threads
 2578        input_thread = param.get("threads", config.get("threads", None))
 2579
 2580        # Check threads
 2581        if not input_thread:
 2582            threads = default
 2583        elif int(input_thread) <= 0:
 2584            threads = os.cpu_count()
 2585        else:
 2586            threads = int(input_thread)
 2587        return threads
 2588
 2589    def get_memory(self, default: str = None) -> str:
 2590        """
 2591        This function retrieves the memory value from parameters or configuration with a default value
 2592        if not found.
 2593
 2594        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2595        default value is used as a fallback in case the `memory` parameter is not provided in the
 2596        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2597        the function
 2598        :type default: str
 2599        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2600        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2601        return the default value provided as an argument to the function.
 2602        """
 2603
 2604        # Config
 2605        config = self.get_config()
 2606
 2607        # Param
 2608        param = self.get_param()
 2609
 2610        # Input threads
 2611        input_memory = param.get("memory", config.get("memory", None))
 2612
 2613        # Check threads
 2614        if input_memory:
 2615            memory = input_memory
 2616        else:
 2617            memory = default
 2618
 2619        return memory
 2620
 2621    def update_from_vcf(self, vcf_file: str) -> None:
 2622        """
 2623        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2624
 2625        :param vcf_file: the path to the VCF file
 2626        """
 2627
 2628        connexion_format = self.get_connexion_format()
 2629
 2630        if connexion_format in ["duckdb"]:
 2631            self.update_from_vcf_duckdb(vcf_file)
 2632        elif connexion_format in ["sqlite"]:
 2633            self.update_from_vcf_sqlite(vcf_file)
 2634
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, joining on #CHROM/POS/REF/ALT and concatenating with ';'.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a pandas DataFrame, skipping the header lines so
        # that the #CHROM line becomes the DataFrame header.
        # NOTE(review): assumes get_header_length() counts the lines preceding
        # the #CHROM line — confirm against its implementation.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # The SQL below references `vcf_df` by name: DuckDB resolves the local
        # pandas DataFrame via its replacement scan, so the variable name must
        # match the table name used in the query — do not rename it.
        # The UPDATE appends the VCF INFO to the existing INFO, inserting a ';'
        # separator only when both sides are non-empty (not '' or '.').
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2690
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file, using a
        temporary SQLite table as a staging area.

        The VCF body is appended to a temporary table cloned from `variants`,
        then each variant's INFO is concatenated (SQLite `||` operator) with
        the matching INFO of the VCF, joining on #CHROM/POS/REF/ALT.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body (all '#' comment lines skipped) into the staging
        # table.
        # NOTE(review): assumes the VCF has exactly 8 columns (no FORMAT or
        # sample columns) — the column assignment below fails otherwise;
        # confirm against callers.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data: append the staged INFO to the
        # existing INFO, inserting a ';' separator only when both sides are
        # non-empty (not '' or '.').
        # warning: CONCAT as || operator
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop the staging table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2748
 2749    def drop_variants_table(self) -> None:
 2750        """
 2751        > This function drops the variants table
 2752        """
 2753
 2754        table_variants = self.get_table_variants()
 2755        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2756        self.conn.execute(sql_table_variants)
 2757
 2758    def set_variant_id(
 2759        self, variant_id_column: str = "variant_id", force: bool = None
 2760    ) -> str:
 2761        """
 2762        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2763        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2764
 2765        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2766        to variant_id
 2767        :type variant_id_column: str (optional)
 2768        :param force: If True, the variant_id column will be created even if it already exists
 2769        :type force: bool
 2770        :return: The name of the column that contains the variant_id
 2771        """
 2772
 2773        # Assembly
 2774        assembly = self.get_param().get(
 2775            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2776        )
 2777
 2778        # INFO/Tag prefix
 2779        prefix = self.get_explode_infos_prefix()
 2780
 2781        # Explode INFO/SVTYPE
 2782        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2783
 2784        # variants table
 2785        table_variants = self.get_table_variants()
 2786
 2787        # variant_id column
 2788        if not variant_id_column:
 2789            variant_id_column = "variant_id"
 2790
 2791        # Creta variant_id column
 2792        if "variant_id" not in self.get_extra_infos() or force:
 2793
 2794            # Create column
 2795            self.add_column(
 2796                table_name=table_variants,
 2797                column_name=variant_id_column,
 2798                column_type="UBIGINT",
 2799                default_value="0",
 2800            )
 2801
 2802            # Update column
 2803            self.conn.execute(
 2804                f"""
 2805                    UPDATE {table_variants}
 2806                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2807                """
 2808            )
 2809
 2810        # Remove added columns
 2811        for added_column in added_columns:
 2812            self.drop_column(column=added_column)
 2813
 2814        # return variant_id column name
 2815        return variant_id_column
 2816
 2817    def get_variant_id_column(
 2818        self, variant_id_column: str = "variant_id", force: bool = None
 2819    ) -> str:
 2820        """
 2821        This function returns the variant_id column name
 2822
 2823        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2824        defaults to variant_id
 2825        :type variant_id_column: str (optional)
 2826        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2827        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2828        if it is not already set, or if it is set
 2829        :type force: bool
 2830        :return: The variant_id column name.
 2831        """
 2832
 2833        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2834
 2835    ###
 2836    # Annotation
 2837    ###
 2838
 2839    def scan_databases(
 2840        self,
 2841        database_formats: list = ["parquet"],
 2842        database_releases: list = ["current"],
 2843    ) -> dict:
 2844        """
 2845        The function `scan_databases` scans for available databases based on specified formats and
 2846        releases.
 2847
 2848        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2849        of the databases to be scanned. In this case, the accepted format is "parquet"
 2850        :type database_formats: list ["parquet"]
 2851        :param database_releases: The `database_releases` parameter is a list that specifies the
 2852        releases of the databases to be scanned. In the provided function, the default value for
 2853        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2854        databases that are in the "current"
 2855        :type database_releases: list
 2856        :return: The function `scan_databases` returns a dictionary containing information about
 2857        databases that match the specified formats and releases.
 2858        """
 2859
 2860        # Config
 2861        config = self.get_config()
 2862
 2863        # Param
 2864        param = self.get_param()
 2865
 2866        # Param - Assembly
 2867        assembly = param.get("assembly", config.get("assembly", None))
 2868        if not assembly:
 2869            assembly = DEFAULT_ASSEMBLY
 2870            log.warning(f"Default assembly '{assembly}'")
 2871
 2872        # Scan for availabled databases
 2873        log.info(
 2874            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2875        )
 2876        databases_infos_dict = databases_infos(
 2877            database_folder_releases=database_releases,
 2878            database_formats=database_formats,
 2879            assembly=assembly,
 2880            config=config,
 2881        )
 2882        log.info(
 2883            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2884        )
 2885
 2886        return databases_infos_dict
 2887
 2888    def annotation(self) -> None:
 2889        """
 2890        It annotates the VCF file with the annotations specified in the config file.
 2891        """
 2892
 2893        # Config
 2894        config = self.get_config()
 2895
 2896        # Param
 2897        param = self.get_param()
 2898
 2899        # Param - Assembly
 2900        assembly = param.get("assembly", config.get("assembly", None))
 2901        if not assembly:
 2902            assembly = DEFAULT_ASSEMBLY
 2903            log.warning(f"Default assembly '{assembly}'")
 2904
 2905        # annotations databases folders
 2906        annotations_databases = set(
 2907            config.get("folders", {})
 2908            .get("databases", {})
 2909            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2910            + config.get("folders", {})
 2911            .get("databases", {})
 2912            .get("parquet", ["~/howard/databases/parquet/current"])
 2913            + config.get("folders", {})
 2914            .get("databases", {})
 2915            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2916        )
 2917
 2918        # Get param annotations
 2919        if param.get("annotations", None) and isinstance(
 2920            param.get("annotations", None), str
 2921        ):
 2922            log.debug(param.get("annotations", None))
 2923            param_annotation_list = param.get("annotations").split(",")
 2924        else:
 2925            param_annotation_list = []
 2926
 2927        # Each tools param
 2928        if param.get("annotation_parquet", None) != None:
 2929            log.debug(
 2930                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2931            )
 2932            if isinstance(param.get("annotation_parquet", None), list):
 2933                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2934            else:
 2935                param_annotation_list.append(param.get("annotation_parquet"))
 2936        if param.get("annotation_snpsift", None) != None:
 2937            if isinstance(param.get("annotation_snpsift", None), list):
 2938                param_annotation_list.append(
 2939                    "snpsift:"
 2940                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2941                )
 2942            else:
 2943                param_annotation_list.append(
 2944                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2945                )
 2946        if param.get("annotation_snpeff", None) != None:
 2947            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2948        if param.get("annotation_bcftools", None) != None:
 2949            if isinstance(param.get("annotation_bcftools", None), list):
 2950                param_annotation_list.append(
 2951                    "bcftools:"
 2952                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2953                )
 2954            else:
 2955                param_annotation_list.append(
 2956                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2957                )
 2958        if param.get("annotation_annovar", None) != None:
 2959            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2960        if param.get("annotation_exomiser", None) != None:
 2961            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2962        if param.get("annotation_splice", None) != None:
 2963            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2964
 2965        # Merge param annotations list
 2966        param["annotations"] = ",".join(param_annotation_list)
 2967
 2968        # debug
 2969        log.debug(f"param_annotations={param['annotations']}")
 2970
 2971        if param.get("annotations"):
 2972
 2973            # Log
 2974            # log.info("Annotations - Check annotation parameters")
 2975
 2976            if not "annotation" in param:
 2977                param["annotation"] = {}
 2978
 2979            # List of annotations parameters
 2980            annotations_list_input = {}
 2981            if isinstance(param.get("annotations", None), str):
 2982                annotation_file_list = [
 2983                    value for value in param.get("annotations", "").split(",")
 2984                ]
 2985                for annotation_file in annotation_file_list:
 2986                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2987            else:
 2988                annotations_list_input = param.get("annotations", {})
 2989
 2990            log.info(f"Quick Annotations:")
 2991            for annotation_key in list(annotations_list_input.keys()):
 2992                log.info(f"   {annotation_key}")
 2993
 2994            # List of annotations and associated fields
 2995            annotations_list = {}
 2996
 2997            for annotation_file in annotations_list_input:
 2998
 2999                # Explode annotations if ALL
 3000                if (
 3001                    annotation_file.upper() == "ALL"
 3002                    or annotation_file.upper().startswith("ALL:")
 3003                ):
 3004
 3005                    # check ALL parameters (formats, releases)
 3006                    annotation_file_split = annotation_file.split(":")
 3007                    database_formats = "parquet"
 3008                    database_releases = "current"
 3009                    for annotation_file_option in annotation_file_split[1:]:
 3010                        database_all_options_split = annotation_file_option.split("=")
 3011                        if database_all_options_split[0] == "format":
 3012                            database_formats = database_all_options_split[1].split("+")
 3013                        if database_all_options_split[0] == "release":
 3014                            database_releases = database_all_options_split[1].split("+")
 3015
 3016                    # Scan for availabled databases
 3017                    databases_infos_dict = self.scan_databases(
 3018                        database_formats=database_formats,
 3019                        database_releases=database_releases,
 3020                    )
 3021
 3022                    # Add found databases in annotation parameters
 3023                    for database_infos in databases_infos_dict.keys():
 3024                        annotations_list[database_infos] = {"INFO": None}
 3025
 3026                else:
 3027                    annotations_list[annotation_file] = annotations_list_input[
 3028                        annotation_file
 3029                    ]
 3030
 3031            # Check each databases
 3032            if len(annotations_list):
 3033
 3034                log.info(
 3035                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3036                )
 3037
 3038                for annotation_file in annotations_list:
 3039
 3040                    # Init
 3041                    annotations = annotations_list.get(annotation_file, None)
 3042
 3043                    # Annotation snpEff
 3044                    if annotation_file.startswith("snpeff"):
 3045
 3046                        log.debug(f"Quick Annotation snpEff")
 3047
 3048                        if "snpeff" not in param["annotation"]:
 3049                            param["annotation"]["snpeff"] = {}
 3050
 3051                        if "options" not in param["annotation"]["snpeff"]:
 3052                            param["annotation"]["snpeff"]["options"] = ""
 3053
 3054                        # snpEff options in annotations
 3055                        param["annotation"]["snpeff"]["options"] = "".join(
 3056                            annotation_file.split(":")[1:]
 3057                        )
 3058
 3059                    # Annotation Annovar
 3060                    elif annotation_file.startswith("annovar"):
 3061
 3062                        log.debug(f"Quick Annotation Annovar")
 3063
 3064                        if "annovar" not in param["annotation"]:
 3065                            param["annotation"]["annovar"] = {}
 3066
 3067                        if "annotations" not in param["annotation"]["annovar"]:
 3068                            param["annotation"]["annovar"]["annotations"] = {}
 3069
 3070                        # Options
 3071                        annotation_file_split = annotation_file.split(":")
 3072                        for annotation_file_annotation in annotation_file_split[1:]:
 3073                            if annotation_file_annotation:
 3074                                param["annotation"]["annovar"]["annotations"][
 3075                                    annotation_file_annotation
 3076                                ] = annotations
 3077
 3078                    # Annotation Exomiser
 3079                    elif annotation_file.startswith("exomiser"):
 3080
 3081                        log.debug(f"Quick Annotation Exomiser")
 3082
 3083                        param["annotation"]["exomiser"] = params_string_to_dict(
 3084                            annotation_file
 3085                        )
 3086
 3087                    # Annotation Splice
 3088                    elif annotation_file.startswith("splice"):
 3089
 3090                        log.debug(f"Quick Annotation Splice")
 3091
 3092                        param["annotation"]["splice"] = params_string_to_dict(
 3093                            annotation_file
 3094                        )
 3095
 3096                    # Annotation Parquet or BCFTOOLS
 3097                    else:
 3098
 3099                        # Tools detection
 3100                        if annotation_file.startswith("bcftools:"):
 3101                            annotation_tool_initial = "bcftools"
 3102                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3103                        elif annotation_file.startswith("snpsift:"):
 3104                            annotation_tool_initial = "snpsift"
 3105                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3106                        elif annotation_file.startswith("bigwig:"):
 3107                            annotation_tool_initial = "bigwig"
 3108                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3109                        else:
 3110                            annotation_tool_initial = None
 3111
 3112                        # list of files
 3113                        annotation_file_list = annotation_file.replace("+", ":").split(
 3114                            ":"
 3115                        )
 3116
 3117                        for annotation_file in annotation_file_list:
 3118
 3119                            if annotation_file:
 3120
 3121                                # Annotation tool initial
 3122                                annotation_tool = annotation_tool_initial
 3123
 3124                                # Find file
 3125                                annotation_file_found = None
 3126
 3127                                if os.path.exists(annotation_file):
 3128                                    annotation_file_found = annotation_file
 3129                                elif os.path.exists(full_path(annotation_file)):
 3130                                    annotation_file_found = full_path(annotation_file)
 3131                                else:
 3132                                    # Find within assembly folders
 3133                                    for annotations_database in annotations_databases:
 3134                                        found_files = find_all(
 3135                                            annotation_file,
 3136                                            os.path.join(
 3137                                                annotations_database, assembly
 3138                                            ),
 3139                                        )
 3140                                        if len(found_files) > 0:
 3141                                            annotation_file_found = found_files[0]
 3142                                            break
 3143                                    if not annotation_file_found and not assembly:
 3144                                        # Find within folders
 3145                                        for (
 3146                                            annotations_database
 3147                                        ) in annotations_databases:
 3148                                            found_files = find_all(
 3149                                                annotation_file, annotations_database
 3150                                            )
 3151                                            if len(found_files) > 0:
 3152                                                annotation_file_found = found_files[0]
 3153                                                break
 3154                                log.debug(
 3155                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3156                                )
 3157
 3158                                # Full path
 3159                                annotation_file_found = full_path(annotation_file_found)
 3160
 3161                                if annotation_file_found:
 3162
 3163                                    database = Database(database=annotation_file_found)
 3164                                    quick_annotation_format = database.get_format()
 3165                                    quick_annotation_is_compressed = (
 3166                                        database.is_compressed()
 3167                                    )
 3168                                    quick_annotation_is_indexed = os.path.exists(
 3169                                        f"{annotation_file_found}.tbi"
 3170                                    )
 3171                                    bcftools_preference = False
 3172
 3173                                    # Check Annotation Tool
 3174                                    if not annotation_tool:
 3175                                        if (
 3176                                            bcftools_preference
 3177                                            and quick_annotation_format
 3178                                            in ["vcf", "bed"]
 3179                                            and quick_annotation_is_compressed
 3180                                            and quick_annotation_is_indexed
 3181                                        ):
 3182                                            annotation_tool = "bcftools"
 3183                                        elif quick_annotation_format in [
 3184                                            "vcf",
 3185                                            "bed",
 3186                                            "tsv",
 3187                                            "tsv",
 3188                                            "csv",
 3189                                            "json",
 3190                                            "tbl",
 3191                                            "parquet",
 3192                                            "duckdb",
 3193                                        ]:
 3194                                            annotation_tool = "parquet"
 3195                                        elif quick_annotation_format in ["bw"]:
 3196                                            annotation_tool = "bigwig"
 3197                                        else:
 3198                                            log.error(
 3199                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3200                                            )
 3201                                            raise ValueError(
 3202                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3203                                            )
 3204
 3205                                    log.debug(
 3206                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3207                                    )
 3208
 3209                                    # Annotation Tool dispatch
 3210                                    if annotation_tool:
 3211                                        if annotation_tool not in param["annotation"]:
 3212                                            param["annotation"][annotation_tool] = {}
 3213                                        if (
 3214                                            "annotations"
 3215                                            not in param["annotation"][annotation_tool]
 3216                                        ):
 3217                                            param["annotation"][annotation_tool][
 3218                                                "annotations"
 3219                                            ] = {}
 3220                                        param["annotation"][annotation_tool][
 3221                                            "annotations"
 3222                                        ][annotation_file_found] = annotations
 3223
 3224                                else:
 3225                                    log.warning(
 3226                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3227                                    )
 3228
 3229                self.set_param(param)
 3230
 3231        if param.get("annotation", None):
 3232            log.info("Annotations")
 3233            if param.get("annotation", {}).get("parquet", None):
 3234                log.info("Annotations 'parquet'...")
 3235                self.annotation_parquet()
 3236            if param.get("annotation", {}).get("bcftools", None):
 3237                log.info("Annotations 'bcftools'...")
 3238                self.annotation_bcftools()
 3239            if param.get("annotation", {}).get("snpsift", None):
 3240                log.info("Annotations 'snpsift'...")
 3241                self.annotation_snpsift()
 3242            if param.get("annotation", {}).get("bigwig", None):
 3243                log.info("Annotations 'bigwig'...")
 3244                self.annotation_bigwig()
 3245            if param.get("annotation", {}).get("annovar", None):
 3246                log.info("Annotations 'annovar'...")
 3247                self.annotation_annovar()
 3248            if param.get("annotation", {}).get("snpeff", None):
 3249                log.info("Annotations 'snpeff'...")
 3250                self.annotation_snpeff()
 3251            if param.get("annotation", {}).get("exomiser", None) is not None:
 3252                log.info("Annotations 'exomiser'...")
 3253                self.annotation_exomiser()
 3254            if param.get("annotation", {}).get("splice", None) is not None:
 3255                log.info("Annotations 'splice' ...")
 3256                self.annotation_splice()
 3257
 3258        # Explode INFOS fields into table fields
 3259        if self.get_explode_infos():
 3260            self.explode_infos(
 3261                prefix=self.get_explode_infos_prefix(),
 3262                fields=self.get_explode_infos_fields(),
 3263                force=True,
 3264            )
 3265
 3266    def annotation_bigwig(self, threads: int = None) -> None:
 3267        """
 3268        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3269
 3270        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3271        number of threads to be used for parallel processing during the annotation process. If the
 3272        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3273        threads to use based on the system configuration
 3274        :type threads: int
 3275        :return: True
 3276        """
 3277
 3278        # DEBUG
 3279        log.debug("Start annotation with bigwig databases")
 3280
 3281        # # Threads
 3282        # if not threads:
 3283        #     threads = self.get_threads()
 3284        # log.debug("Threads: " + str(threads))
 3285
 3286        # Config
 3287        config = self.get_config()
 3288        log.debug("Config: " + str(config))
 3289
 3290        # Config - BCFTools databases folders
 3291        databases_folders = set(
 3292            self.get_config()
 3293            .get("folders", {})
 3294            .get("databases", {})
 3295            .get("annotations", ["."])
 3296            + self.get_config()
 3297            .get("folders", {})
 3298            .get("databases", {})
 3299            .get("bigwig", ["."])
 3300        )
 3301        log.debug("Databases annotations: " + str(databases_folders))
 3302
 3303        # Param
 3304        annotations = (
 3305            self.get_param()
 3306            .get("annotation", {})
 3307            .get("bigwig", {})
 3308            .get("annotations", None)
 3309        )
 3310        log.debug("Annotations: " + str(annotations))
 3311
 3312        # Assembly
 3313        assembly = self.get_param().get(
 3314            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3315        )
 3316
 3317        # Data
 3318        table_variants = self.get_table_variants()
 3319
 3320        # Check if not empty
 3321        log.debug("Check if not empty")
 3322        sql_query_chromosomes = (
 3323            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3324        )
 3325        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3326        if not sql_query_chromosomes_df["count"][0]:
 3327            log.info(f"VCF empty")
 3328            return
 3329
 3330        # VCF header
 3331        vcf_reader = self.get_header()
 3332        log.debug("Initial header: " + str(vcf_reader.infos))
 3333
 3334        # Existing annotations
 3335        for vcf_annotation in self.get_header().infos:
 3336
 3337            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3338            log.debug(
 3339                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3340            )
 3341
 3342        if annotations:
 3343
 3344            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3345
 3346                # Export VCF file
 3347                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3348
 3349                # annotation_bigwig_config
 3350                annotation_bigwig_config_list = []
 3351
 3352                for annotation in annotations:
 3353                    annotation_fields = annotations[annotation]
 3354
 3355                    # Annotation Name
 3356                    annotation_name = os.path.basename(annotation)
 3357
 3358                    if not annotation_fields:
 3359                        annotation_fields = {"INFO": None}
 3360
 3361                    log.debug(f"Annotation '{annotation_name}'")
 3362                    log.debug(
 3363                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3364                    )
 3365
 3366                    # Create Database
 3367                    database = Database(
 3368                        database=annotation,
 3369                        databases_folders=databases_folders,
 3370                        assembly=assembly,
 3371                    )
 3372
 3373                    # Find files
 3374                    db_file = database.get_database()
 3375                    db_file = full_path(db_file)
 3376                    db_hdr_file = database.get_header_file()
 3377                    db_hdr_file = full_path(db_hdr_file)
 3378                    db_file_type = database.get_format()
 3379
 3380                    # If db_file is http ?
 3381                    if database.get_database().startswith("http"):
 3382
 3383                        # Datbase is HTTP URL
 3384                        db_file_is_http = True
 3385
 3386                        # DB file keep as URL
 3387                        db_file = database.get_database()
 3388                        log.warning(
 3389                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3390                        )
 3391
 3392                        # Retrieve automatic annotation field name
 3393                        annotation_field = clean_annotation_field(
 3394                            os.path.basename(db_file).replace(".bw", "")
 3395                        )
 3396                        log.debug(
 3397                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3398                        )
 3399
 3400                        # Create automatic header file
 3401                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3402                        with open(db_hdr_file, "w") as f:
 3403                            f.write("##fileformat=VCFv4.2\n")
 3404                            f.write(
 3405                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3406                            )
 3407                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3408
 3409                    else:
 3410
 3411                        # Datbase is NOT HTTP URL
 3412                        db_file_is_http = False
 3413
 3414                    # Check index - try to create if not exists
 3415                    if (
 3416                        db_file is None
 3417                        or db_hdr_file is None
 3418                        or (not os.path.exists(db_file) and not db_file_is_http)
 3419                        or not os.path.exists(db_hdr_file)
 3420                        or not db_file_type in ["bw"]
 3421                    ):
 3422                        # if False:
 3423                        log.error("Annotation failed: database not valid")
 3424                        log.error(f"Annotation annotation file: {db_file}")
 3425                        log.error(f"Annotation annotation file type: {db_file_type}")
 3426                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3427                        raise ValueError(
 3428                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3429                        )
 3430                    else:
 3431
 3432                        # Log
 3433                        log.debug(
 3434                            f"Annotation '{annotation}' - file: "
 3435                            + str(db_file)
 3436                            + " and "
 3437                            + str(db_hdr_file)
 3438                        )
 3439
 3440                        # Load header as VCF object
 3441                        db_hdr_vcf = Variants(input=db_hdr_file)
 3442                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3443                        log.debug(
 3444                            "Annotation database header: "
 3445                            + str(db_hdr_vcf_header_infos)
 3446                        )
 3447
 3448                        # For all fields in database
 3449                        annotation_fields_full = False
 3450                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3451                            annotation_fields = {
 3452                                key: key for key in db_hdr_vcf_header_infos
 3453                            }
 3454                            log.debug(
 3455                                "Annotation database header - All annotations added: "
 3456                                + str(annotation_fields)
 3457                            )
 3458                            annotation_fields_full = True
 3459
 3460                        # Init
 3461                        cyvcf2_header_rename_dict = {}
 3462                        cyvcf2_header_list = []
 3463                        cyvcf2_header_indexes = {}
 3464
 3465                        # process annotation fields
 3466                        for annotation_field in annotation_fields:
 3467
 3468                            # New annotation name
 3469                            annotation_field_new = annotation_fields[annotation_field]
 3470
 3471                            # Check annotation field and index in header
 3472                            if (
 3473                                annotation_field
 3474                                in db_hdr_vcf.get_header_columns_as_list()
 3475                            ):
 3476                                annotation_field_index = (
 3477                                    db_hdr_vcf.get_header_columns_as_list().index(
 3478                                        annotation_field
 3479                                    )
 3480                                    - 3
 3481                                )
 3482                                cyvcf2_header_indexes[annotation_field_new] = (
 3483                                    annotation_field_index
 3484                                )
 3485                            else:
 3486                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3487                                log.error(msg_err)
 3488                                raise ValueError(msg_err)
 3489
 3490                            # Append annotation field in cyvcf2 header list
 3491                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3492                                db_hdr_vcf_header_infos[annotation_field].id
 3493                            )
 3494                            cyvcf2_header_list.append(
 3495                                {
 3496                                    "ID": annotation_field_new,
 3497                                    "Number": db_hdr_vcf_header_infos[
 3498                                        annotation_field
 3499                                    ].num,
 3500                                    "Type": db_hdr_vcf_header_infos[
 3501                                        annotation_field
 3502                                    ].type,
 3503                                    "Description": db_hdr_vcf_header_infos[
 3504                                        annotation_field
 3505                                    ].desc,
 3506                                }
 3507                            )
 3508
 3509                            # Add header on VCF
 3510                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3511                                annotation_field_new,
 3512                                db_hdr_vcf_header_infos[annotation_field].num,
 3513                                db_hdr_vcf_header_infos[annotation_field].type,
 3514                                db_hdr_vcf_header_infos[annotation_field].desc,
 3515                                "HOWARD BigWig annotation",
 3516                                "unknown",
 3517                                self.code_type_map[
 3518                                    db_hdr_vcf_header_infos[annotation_field].type
 3519                                ],
 3520                            )
 3521
 3522                        # Load bigwig database
 3523                        bw_db = pyBigWig.open(db_file)
 3524                        if bw_db.isBigWig():
 3525                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3526                        else:
 3527                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3528                            log.error(msg_err)
 3529                            raise ValueError(msg_err)
 3530
 3531                        annotation_bigwig_config_list.append(
 3532                            {
 3533                                "db_file": db_file,
 3534                                "bw_db": bw_db,
 3535                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3536                                "cyvcf2_header_list": cyvcf2_header_list,
 3537                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3538                            }
 3539                        )
 3540
 3541                # Annotate
 3542                if annotation_bigwig_config_list:
 3543
 3544                    # Annotation config
 3545                    log.debug(
 3546                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3547                    )
 3548
 3549                    # Export VCF file
 3550                    self.export_variant_vcf(
 3551                        vcf_file=tmp_vcf_name,
 3552                        remove_info=True,
 3553                        add_samples=False,
 3554                        index=True,
 3555                    )
 3556
 3557                    # Load input tmp file
 3558                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3559
 3560                    # Add header in input file
 3561                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3562                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3563                            "cyvcf2_header_list", []
 3564                        ):
 3565                            log.info(
 3566                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3567                            )
 3568                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3569
 3570                    # Create output VCF file
 3571                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3572                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3573
 3574                    # Fetch variants
 3575                    log.info(f"Annotations 'bigwig' start...")
 3576                    for variant in input_vcf:
 3577
 3578                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3579
 3580                            # DB and indexes
 3581                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3582                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3583                                "cyvcf2_header_indexes", None
 3584                            )
 3585
 3586                            # Retrieve value from chrom pos
 3587                            res = bw_db.values(
 3588                                variant.CHROM, variant.POS - 1, variant.POS
 3589                            )
 3590
 3591                            # For each annotation fields (and indexes)
 3592                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3593
 3594                                # If value is NOT nNone
 3595                                if not np.isnan(
 3596                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3597                                ):
 3598                                    variant.INFO[cyvcf2_header_index] = res[
 3599                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3600                                    ]
 3601
 3602                        # Add record in output file
 3603                        output_vcf.write_record(variant)
 3604
 3605                    # Log
 3606                    log.debug(f"Annotation done.")
 3607
 3608                    # Close and write file
 3609                    log.info(f"Annotations 'bigwig' write...")
 3610                    output_vcf.close()
 3611                    log.debug(f"Write done.")
 3612
 3613                    # Update variants
 3614                    log.info(f"Annotations 'bigwig' update...")
 3615                    self.update_from_vcf(output_vcf_file)
 3616                    log.debug(f"Update done.")
 3617
 3618        return True
 3619
 3620    def annotation_snpsift(self, threads: int = None) -> None:
 3621        """
 3622        This function annotate with bcftools
 3623
 3624        :param threads: Number of threads to use
 3625        :return: the value of the variable "return_value".
 3626        """
 3627
 3628        # DEBUG
 3629        log.debug("Start annotation with bcftools databases")
 3630
 3631        # Threads
 3632        if not threads:
 3633            threads = self.get_threads()
 3634        log.debug("Threads: " + str(threads))
 3635
 3636        # Config
 3637        config = self.get_config()
 3638        log.debug("Config: " + str(config))
 3639
 3640        # Config - snpSift
 3641        snpsift_bin_command = get_bin_command(
 3642            bin="SnpSift.jar",
 3643            tool="snpsift",
 3644            bin_type="jar",
 3645            config=config,
 3646            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3647        )
 3648        if not snpsift_bin_command:
 3649            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3650            log.error(msg_err)
 3651            raise ValueError(msg_err)
 3652
 3653        # Config - bcftools
 3654        bcftools_bin_command = get_bin_command(
 3655            bin="bcftools",
 3656            tool="bcftools",
 3657            bin_type="bin",
 3658            config=config,
 3659            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3660        )
 3661        if not bcftools_bin_command:
 3662            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3663            log.error(msg_err)
 3664            raise ValueError(msg_err)
 3665
 3666        # Config - BCFTools databases folders
 3667        databases_folders = set(
 3668            self.get_config()
 3669            .get("folders", {})
 3670            .get("databases", {})
 3671            .get("annotations", ["."])
 3672            + self.get_config()
 3673            .get("folders", {})
 3674            .get("databases", {})
 3675            .get("bcftools", ["."])
 3676        )
 3677        log.debug("Databases annotations: " + str(databases_folders))
 3678
 3679        # Param
 3680        annotations = (
 3681            self.get_param()
 3682            .get("annotation", {})
 3683            .get("snpsift", {})
 3684            .get("annotations", None)
 3685        )
 3686        log.debug("Annotations: " + str(annotations))
 3687
 3688        # Assembly
 3689        assembly = self.get_param().get(
 3690            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3691        )
 3692
 3693        # Data
 3694        table_variants = self.get_table_variants()
 3695
 3696        # Check if not empty
 3697        log.debug("Check if not empty")
 3698        sql_query_chromosomes = (
 3699            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3700        )
 3701        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3702        if not sql_query_chromosomes_df["count"][0]:
 3703            log.info(f"VCF empty")
 3704            return
 3705
 3706        # VCF header
 3707        vcf_reader = self.get_header()
 3708        log.debug("Initial header: " + str(vcf_reader.infos))
 3709
 3710        # Existing annotations
 3711        for vcf_annotation in self.get_header().infos:
 3712
 3713            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3714            log.debug(
 3715                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3716            )
 3717
 3718        if annotations:
 3719
 3720            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3721
 3722                # Export VCF file
 3723                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3724
 3725                # Init
 3726                commands = {}
 3727
 3728                for annotation in annotations:
 3729                    annotation_fields = annotations[annotation]
 3730
 3731                    # Annotation Name
 3732                    annotation_name = os.path.basename(annotation)
 3733
 3734                    if not annotation_fields:
 3735                        annotation_fields = {"INFO": None}
 3736
 3737                    log.debug(f"Annotation '{annotation_name}'")
 3738                    log.debug(
 3739                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3740                    )
 3741
 3742                    # Create Database
 3743                    database = Database(
 3744                        database=annotation,
 3745                        databases_folders=databases_folders,
 3746                        assembly=assembly,
 3747                    )
 3748
 3749                    # Find files
 3750                    db_file = database.get_database()
 3751                    db_file = full_path(db_file)
 3752                    db_hdr_file = database.get_header_file()
 3753                    db_hdr_file = full_path(db_hdr_file)
 3754                    db_file_type = database.get_format()
 3755                    db_tbi_file = f"{db_file}.tbi"
 3756                    db_file_compressed = database.is_compressed()
 3757
 3758                    # Check if compressed
 3759                    if not db_file_compressed:
 3760                        log.error(
 3761                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3762                        )
 3763                        raise ValueError(
 3764                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3765                        )
 3766
 3767                    # Check if indexed
 3768                    if not os.path.exists(db_tbi_file):
 3769                        log.error(
 3770                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3771                        )
 3772                        raise ValueError(
 3773                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3774                        )
 3775
 3776                    # Check index - try to create if not exists
 3777                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3778                        log.error("Annotation failed: database not valid")
 3779                        log.error(f"Annotation annotation file: {db_file}")
 3780                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3781                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3782                        raise ValueError(
 3783                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3784                        )
 3785                    else:
 3786
 3787                        log.debug(
 3788                            f"Annotation '{annotation}' - file: "
 3789                            + str(db_file)
 3790                            + " and "
 3791                            + str(db_hdr_file)
 3792                        )
 3793
 3794                        # Load header as VCF object
 3795                        db_hdr_vcf = Variants(input=db_hdr_file)
 3796                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3797                        log.debug(
 3798                            "Annotation database header: "
 3799                            + str(db_hdr_vcf_header_infos)
 3800                        )
 3801
 3802                        # For all fields in database
 3803                        annotation_fields_full = False
 3804                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3805                            annotation_fields = {
 3806                                key: key for key in db_hdr_vcf_header_infos
 3807                            }
 3808                            log.debug(
 3809                                "Annotation database header - All annotations added: "
 3810                                + str(annotation_fields)
 3811                            )
 3812                            annotation_fields_full = True
 3813
 3814                        # # Create file for field rename
 3815                        # log.debug("Create file for field rename")
 3816                        # tmp_rename = NamedTemporaryFile(
 3817                        #     prefix=self.get_prefix(),
 3818                        #     dir=self.get_tmp_dir(),
 3819                        #     suffix=".rename",
 3820                        #     delete=False,
 3821                        # )
 3822                        # tmp_rename_name = tmp_rename.name
 3823                        # tmp_files.append(tmp_rename_name)
 3824
 3825                        # Number of fields
 3826                        nb_annotation_field = 0
 3827                        annotation_list = []
 3828                        annotation_infos_rename_list = []
 3829
 3830                        for annotation_field in annotation_fields:
 3831
 3832                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3833                            annotation_fields_new_name = annotation_fields.get(
 3834                                annotation_field, annotation_field
 3835                            )
 3836                            if not annotation_fields_new_name:
 3837                                annotation_fields_new_name = annotation_field
 3838
 3839                            # Check if field is in DB and if field is not elready in input data
 3840                            if (
 3841                                annotation_field in db_hdr_vcf.get_header().infos
 3842                                and annotation_fields_new_name
 3843                                not in self.get_header().infos
 3844                            ):
 3845
 3846                                log.info(
 3847                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3848                                )
 3849
 3850                                # BCFTools annotate param to rename fields
 3851                                if annotation_field != annotation_fields_new_name:
 3852                                    annotation_infos_rename_list.append(
 3853                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3854                                    )
 3855
 3856                                # Add INFO field to header
 3857                                db_hdr_vcf_header_infos_number = (
 3858                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3859                                )
 3860                                db_hdr_vcf_header_infos_type = (
 3861                                    db_hdr_vcf_header_infos[annotation_field].type
 3862                                    or "String"
 3863                                )
 3864                                db_hdr_vcf_header_infos_description = (
 3865                                    db_hdr_vcf_header_infos[annotation_field].desc
 3866                                    or f"{annotation_field} description"
 3867                                )
 3868                                db_hdr_vcf_header_infos_source = (
 3869                                    db_hdr_vcf_header_infos[annotation_field].source
 3870                                    or "unknown"
 3871                                )
 3872                                db_hdr_vcf_header_infos_version = (
 3873                                    db_hdr_vcf_header_infos[annotation_field].version
 3874                                    or "unknown"
 3875                                )
 3876
 3877                                vcf_reader.infos[annotation_fields_new_name] = (
 3878                                    vcf.parser._Info(
 3879                                        annotation_fields_new_name,
 3880                                        db_hdr_vcf_header_infos_number,
 3881                                        db_hdr_vcf_header_infos_type,
 3882                                        db_hdr_vcf_header_infos_description,
 3883                                        db_hdr_vcf_header_infos_source,
 3884                                        db_hdr_vcf_header_infos_version,
 3885                                        self.code_type_map[
 3886                                            db_hdr_vcf_header_infos_type
 3887                                        ],
 3888                                    )
 3889                                )
 3890
 3891                                annotation_list.append(annotation_field)
 3892
 3893                                nb_annotation_field += 1
 3894
 3895                            else:
 3896
 3897                                if (
 3898                                    annotation_field
 3899                                    not in db_hdr_vcf.get_header().infos
 3900                                ):
 3901                                    log.warning(
 3902                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3903                                    )
 3904                                if (
 3905                                    annotation_fields_new_name
 3906                                    in self.get_header().infos
 3907                                ):
 3908                                    log.warning(
 3909                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3910                                    )
 3911
 3912                        log.info(
 3913                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3914                        )
 3915
 3916                        annotation_infos = ",".join(annotation_list)
 3917
 3918                        if annotation_infos != "":
 3919
 3920                            # Annotated VCF (and error file)
 3921                            tmp_annotation_vcf_name = os.path.join(
 3922                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3923                            )
 3924                            tmp_annotation_vcf_name_err = (
 3925                                tmp_annotation_vcf_name + ".err"
 3926                            )
 3927
 3928                            # Add fields to annotate
 3929                            if not annotation_fields_full:
 3930                                annotation_infos_option = f"-info {annotation_infos}"
 3931                            else:
 3932                                annotation_infos_option = ""
 3933
 3934                            # Info fields rename
 3935                            if annotation_infos_rename_list:
 3936                                annotation_infos_rename = " -c " + ",".join(
 3937                                    annotation_infos_rename_list
 3938                                )
 3939                            else:
 3940                                annotation_infos_rename = ""
 3941
 3942                            # Annotate command
 3943                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3944
 3945                            # Add command
 3946                            commands[command_annotate] = tmp_annotation_vcf_name
 3947
 3948                if commands:
 3949
 3950                    # Export VCF file
 3951                    self.export_variant_vcf(
 3952                        vcf_file=tmp_vcf_name,
 3953                        remove_info=True,
 3954                        add_samples=False,
 3955                        index=True,
 3956                    )
 3957                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3958
 3959                    # Num command
 3960                    nb_command = 0
 3961
 3962                    # Annotate
 3963                    for command_annotate in commands:
 3964                        nb_command += 1
 3965                        log.info(
 3966                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3967                        )
 3968                        log.debug(f"command_annotate={command_annotate}")
 3969                        run_parallel_commands([command_annotate], threads)
 3970
 3971                        # Debug
 3972                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3973
 3974                        # Update variants
 3975                        log.info(
 3976                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3977                        )
 3978                        self.update_from_vcf(commands[command_annotate])
 3979
 3980    def annotation_bcftools(self, threads: int = None) -> None:
 3981        """
 3982        This function annotate with bcftools
 3983
 3984        :param threads: Number of threads to use
 3985        :return: the value of the variable "return_value".
 3986        """
 3987
 3988        # DEBUG
 3989        log.debug("Start annotation with bcftools databases")
 3990
 3991        # Threads
 3992        if not threads:
 3993            threads = self.get_threads()
 3994        log.debug("Threads: " + str(threads))
 3995
 3996        # Config
 3997        config = self.get_config()
 3998        log.debug("Config: " + str(config))
 3999
 4000        # DEBUG
 4001        delete_tmp = True
 4002        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4003            delete_tmp = False
 4004            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4005
 4006        # Config - BCFTools bin command
 4007        bcftools_bin_command = get_bin_command(
 4008            bin="bcftools",
 4009            tool="bcftools",
 4010            bin_type="bin",
 4011            config=config,
 4012            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4013        )
 4014        if not bcftools_bin_command:
 4015            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4016            log.error(msg_err)
 4017            raise ValueError(msg_err)
 4018
 4019        # Config - BCFTools databases folders
 4020        databases_folders = set(
 4021            self.get_config()
 4022            .get("folders", {})
 4023            .get("databases", {})
 4024            .get("annotations", ["."])
 4025            + self.get_config()
 4026            .get("folders", {})
 4027            .get("databases", {})
 4028            .get("bcftools", ["."])
 4029        )
 4030        log.debug("Databases annotations: " + str(databases_folders))
 4031
 4032        # Param
 4033        annotations = (
 4034            self.get_param()
 4035            .get("annotation", {})
 4036            .get("bcftools", {})
 4037            .get("annotations", None)
 4038        )
 4039        log.debug("Annotations: " + str(annotations))
 4040
 4041        # Assembly
 4042        assembly = self.get_param().get(
 4043            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4044        )
 4045
 4046        # Data
 4047        table_variants = self.get_table_variants()
 4048
 4049        # Check if not empty
 4050        log.debug("Check if not empty")
 4051        sql_query_chromosomes = (
 4052            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4053        )
 4054        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4055        if not sql_query_chromosomes_df["count"][0]:
 4056            log.info(f"VCF empty")
 4057            return
 4058
 4059        # Export in VCF
 4060        log.debug("Create initial file to annotate")
 4061        tmp_vcf = NamedTemporaryFile(
 4062            prefix=self.get_prefix(),
 4063            dir=self.get_tmp_dir(),
 4064            suffix=".vcf.gz",
 4065            delete=False,
 4066        )
 4067        tmp_vcf_name = tmp_vcf.name
 4068
 4069        # VCF header
 4070        vcf_reader = self.get_header()
 4071        log.debug("Initial header: " + str(vcf_reader.infos))
 4072
 4073        # Existing annotations
 4074        for vcf_annotation in self.get_header().infos:
 4075
 4076            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4077            log.debug(
 4078                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4079            )
 4080
 4081        if annotations:
 4082
 4083            tmp_ann_vcf_list = []
 4084            commands = []
 4085            tmp_files = []
 4086            err_files = []
 4087
 4088            for annotation in annotations:
 4089                annotation_fields = annotations[annotation]
 4090
 4091                # Annotation Name
 4092                annotation_name = os.path.basename(annotation)
 4093
 4094                if not annotation_fields:
 4095                    annotation_fields = {"INFO": None}
 4096
 4097                log.debug(f"Annotation '{annotation_name}'")
 4098                log.debug(
 4099                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4100                )
 4101
 4102                # Create Database
 4103                database = Database(
 4104                    database=annotation,
 4105                    databases_folders=databases_folders,
 4106                    assembly=assembly,
 4107                )
 4108
 4109                # Find files
 4110                db_file = database.get_database()
 4111                db_file = full_path(db_file)
 4112                db_hdr_file = database.get_header_file()
 4113                db_hdr_file = full_path(db_hdr_file)
 4114                db_file_type = database.get_format()
 4115                db_tbi_file = f"{db_file}.tbi"
 4116                db_file_compressed = database.is_compressed()
 4117
 4118                # Check if compressed
 4119                if not db_file_compressed:
 4120                    log.error(
 4121                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4122                    )
 4123                    raise ValueError(
 4124                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4125                    )
 4126
 4127                # Check if indexed
 4128                if not os.path.exists(db_tbi_file):
 4129                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4130                    raise ValueError(
 4131                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4132                    )
 4133
 4134                # Check index - try to create if not exists
 4135                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4136                    log.error("Annotation failed: database not valid")
 4137                    log.error(f"Annotation annotation file: {db_file}")
 4138                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4139                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4140                    raise ValueError(
 4141                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4142                    )
 4143                else:
 4144
 4145                    log.debug(
 4146                        f"Annotation '{annotation}' - file: "
 4147                        + str(db_file)
 4148                        + " and "
 4149                        + str(db_hdr_file)
 4150                    )
 4151
 4152                    # Load header as VCF object
 4153                    db_hdr_vcf = Variants(input=db_hdr_file)
 4154                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4155                    log.debug(
 4156                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4157                    )
 4158
 4159                    # For all fields in database
 4160                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4161                        annotation_fields = {
 4162                            key: key for key in db_hdr_vcf_header_infos
 4163                        }
 4164                        log.debug(
 4165                            "Annotation database header - All annotations added: "
 4166                            + str(annotation_fields)
 4167                        )
 4168
 4169                    # Number of fields
 4170                    nb_annotation_field = 0
 4171                    annotation_list = []
 4172
 4173                    for annotation_field in annotation_fields:
 4174
 4175                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4176                        annotation_fields_new_name = annotation_fields.get(
 4177                            annotation_field, annotation_field
 4178                        )
 4179                        if not annotation_fields_new_name:
 4180                            annotation_fields_new_name = annotation_field
 4181
 4182                        # Check if field is in DB and if field is not elready in input data
 4183                        if (
 4184                            annotation_field in db_hdr_vcf.get_header().infos
 4185                            and annotation_fields_new_name
 4186                            not in self.get_header().infos
 4187                        ):
 4188
 4189                            log.info(
 4190                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4191                            )
 4192
 4193                            # Add INFO field to header
 4194                            db_hdr_vcf_header_infos_number = (
 4195                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4196                            )
 4197                            db_hdr_vcf_header_infos_type = (
 4198                                db_hdr_vcf_header_infos[annotation_field].type
 4199                                or "String"
 4200                            )
 4201                            db_hdr_vcf_header_infos_description = (
 4202                                db_hdr_vcf_header_infos[annotation_field].desc
 4203                                or f"{annotation_field} description"
 4204                            )
 4205                            db_hdr_vcf_header_infos_source = (
 4206                                db_hdr_vcf_header_infos[annotation_field].source
 4207                                or "unknown"
 4208                            )
 4209                            db_hdr_vcf_header_infos_version = (
 4210                                db_hdr_vcf_header_infos[annotation_field].version
 4211                                or "unknown"
 4212                            )
 4213
 4214                            vcf_reader.infos[annotation_fields_new_name] = (
 4215                                vcf.parser._Info(
 4216                                    annotation_fields_new_name,
 4217                                    db_hdr_vcf_header_infos_number,
 4218                                    db_hdr_vcf_header_infos_type,
 4219                                    db_hdr_vcf_header_infos_description,
 4220                                    db_hdr_vcf_header_infos_source,
 4221                                    db_hdr_vcf_header_infos_version,
 4222                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4223                                )
 4224                            )
 4225
 4226                            # annotation_list.append(annotation_field)
 4227                            if annotation_field != annotation_fields_new_name:
 4228                                annotation_list.append(
 4229                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4230                                )
 4231                            else:
 4232                                annotation_list.append(annotation_field)
 4233
 4234                            nb_annotation_field += 1
 4235
 4236                        else:
 4237
 4238                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4239                                log.warning(
 4240                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4241                                )
 4242                            if annotation_fields_new_name in self.get_header().infos:
 4243                                log.warning(
 4244                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4245                                )
 4246
 4247                    log.info(
 4248                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4249                    )
 4250
 4251                    annotation_infos = ",".join(annotation_list)
 4252
 4253                    if annotation_infos != "":
 4254
 4255                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4256                        log.debug("Protect Header file - remove #CHROM line if exists")
 4257                        tmp_header_vcf = NamedTemporaryFile(
 4258                            prefix=self.get_prefix(),
 4259                            dir=self.get_tmp_dir(),
 4260                            suffix=".hdr",
 4261                            delete=False,
 4262                        )
 4263                        tmp_header_vcf_name = tmp_header_vcf.name
 4264                        tmp_files.append(tmp_header_vcf_name)
 4265                        # Command
 4266                        if db_hdr_file.endswith(".gz"):
 4267                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4268                        else:
 4269                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4270                        # Run
 4271                        run_parallel_commands([command_extract_header], 1)
 4272
                        # Find chromosomes
 4274                        log.debug("Find chromosomes ")
 4275                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4276                        sql_query_chromosomes_df = self.get_query_to_df(
 4277                            sql_query_chromosomes
 4278                        )
 4279                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4280
 4281                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4282
 4283                        # BED columns in the annotation file
 4284                        if db_file_type in ["bed"]:
 4285                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4286
 4287                        for chrom in chomosomes_list:
 4288
 4289                            # Create BED on initial VCF
 4290                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4291                            tmp_bed = NamedTemporaryFile(
 4292                                prefix=self.get_prefix(),
 4293                                dir=self.get_tmp_dir(),
 4294                                suffix=".bed",
 4295                                delete=False,
 4296                            )
 4297                            tmp_bed_name = tmp_bed.name
 4298                            tmp_files.append(tmp_bed_name)
 4299
                            # Detect regions
 4301                            log.debug(
 4302                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4303                            )
 4304                            window = 1000000
 4305                            sql_query_intervals_for_bed = f"""
 4306                                SELECT  \"#CHROM\",
 4307                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4308                                        \"POS\"+{window}
 4309                                FROM {table_variants} as table_variants
 4310                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4311                            """
 4312                            regions = self.conn.execute(
 4313                                sql_query_intervals_for_bed
 4314                            ).fetchall()
 4315                            merged_regions = merge_regions(regions)
 4316                            log.debug(
 4317                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4318                            )
 4319
 4320                            header = ["#CHROM", "START", "END"]
 4321                            with open(tmp_bed_name, "w") as f:
 4322                                # Write the header with tab delimiter
 4323                                f.write("\t".join(header) + "\n")
 4324                                for d in merged_regions:
 4325                                    # Write each data row with tab delimiter
 4326                                    f.write("\t".join(map(str, d)) + "\n")
 4327
 4328                            # Tmp files
 4329                            tmp_annotation_vcf = NamedTemporaryFile(
 4330                                prefix=self.get_prefix(),
 4331                                dir=self.get_tmp_dir(),
 4332                                suffix=".vcf.gz",
 4333                                delete=False,
 4334                            )
 4335                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4336                            tmp_files.append(tmp_annotation_vcf_name)
 4337                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4338                            tmp_annotation_vcf_name_err = (
 4339                                tmp_annotation_vcf_name + ".err"
 4340                            )
 4341                            err_files.append(tmp_annotation_vcf_name_err)
 4342
 4343                            # Annotate Command
 4344                            log.debug(
 4345                                f"Annotation '{annotation}' - add bcftools command"
 4346                            )
 4347
 4348                            # Command
 4349                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4350
 4351                            # Add command
 4352                            commands.append(command_annotate)
 4353
 4354            # if some commands
 4355            if commands:
 4356
 4357                # Export VCF file
 4358                self.export_variant_vcf(
 4359                    vcf_file=tmp_vcf_name,
 4360                    remove_info=True,
 4361                    add_samples=False,
 4362                    index=True,
 4363                )
 4364
 4365                # Threads
 4366                # calculate threads for annotated commands
 4367                if commands:
 4368                    threads_bcftools_annotate = round(threads / len(commands))
 4369                else:
 4370                    threads_bcftools_annotate = 1
 4371
 4372                if not threads_bcftools_annotate:
 4373                    threads_bcftools_annotate = 1
 4374
 4375                # Add threads option to bcftools commands
 4376                if threads_bcftools_annotate > 1:
 4377                    commands_threaded = []
 4378                    for command in commands:
 4379                        commands_threaded.append(
 4380                            command.replace(
 4381                                f"{bcftools_bin_command} annotate ",
 4382                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4383                            )
 4384                        )
 4385                    commands = commands_threaded
 4386
 4387                # Command annotation multithreading
 4388                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4389                log.info(
 4390                    f"Annotation - Annotation multithreaded in "
 4391                    + str(len(commands))
 4392                    + " commands"
 4393                )
 4394
 4395                run_parallel_commands(commands, threads)
 4396
 4397                # Merge
 4398                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4399
 4400                if tmp_ann_vcf_list_cmd:
 4401
 4402                    # Tmp file
 4403                    tmp_annotate_vcf = NamedTemporaryFile(
 4404                        prefix=self.get_prefix(),
 4405                        dir=self.get_tmp_dir(),
 4406                        suffix=".vcf.gz",
 4407                        delete=True,
 4408                    )
 4409                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4410                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4411                    err_files.append(tmp_annotate_vcf_name_err)
 4412
 4413                    # Tmp file remove command
 4414                    tmp_files_remove_command = ""
 4415                    if tmp_files:
 4416                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4417
 4418                    # Command merge
 4419                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4420                    log.info(
 4421                        f"Annotation - Annotation merging "
 4422                        + str(len(commands))
 4423                        + " annotated files"
 4424                    )
 4425                    log.debug(f"Annotation - merge command: {merge_command}")
 4426                    run_parallel_commands([merge_command], 1)
 4427
 4428                    # Error messages
 4429                    log.info(f"Error/Warning messages:")
 4430                    error_message_command_all = []
 4431                    error_message_command_warning = []
 4432                    error_message_command_err = []
 4433                    for err_file in err_files:
 4434                        with open(err_file, "r") as f:
 4435                            for line in f:
 4436                                message = line.strip()
 4437                                error_message_command_all.append(message)
 4438                                if line.startswith("[W::"):
 4439                                    error_message_command_warning.append(message)
 4440                                if line.startswith("[E::"):
 4441                                    error_message_command_err.append(
 4442                                        f"{err_file}: " + message
 4443                                    )
 4444                    # log info
 4445                    for message in list(
 4446                        set(error_message_command_err + error_message_command_warning)
 4447                    ):
 4448                        log.info(f"   {message}")
 4449                    # debug info
 4450                    for message in list(set(error_message_command_all)):
 4451                        log.debug(f"   {message}")
 4452                    # failed
 4453                    if len(error_message_command_err):
 4454                        log.error("Annotation failed: Error in commands")
 4455                        raise ValueError("Annotation failed: Error in commands")
 4456
 4457                    # Update variants
 4458                    log.info(f"Annotation - Updating...")
 4459                    self.update_from_vcf(tmp_annotate_vcf_name)
 4460
 4461    def annotation_exomiser(self, threads: int = None) -> None:
 4462        """
 4463        This function annotate with Exomiser
 4464
 4465        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4466        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4470            Default : None
 4471        - "preset" (string):
 4472            Analysis preset (available in config folder).
 4473            Used if no full "analysis" is provided.
 4474            Default: "exome"
 4475        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
 4477            Either a dict, or a file in JSON or YAML format.
 4478            Default: None
 4479        - "subject" (dict):
 4480            Sample parameters (see Exomiser docs).
 4481            Example:
 4482                "subject":
 4483                    {
 4484                        "id": "ISDBM322017",
 4485                        "sex": "FEMALE"
 4486                    }
 4487            Default: None
 4488        - "sample" (string):
 4489            Sample name to construct "subject" section:
 4490                "subject":
 4491                    {
 4492                        "id": "<sample>",
 4493                        "sex": "UNKNOWN_SEX"
 4494                    }
 4495            Default: None
 4496        - "phenotypicFeatures" (dict)
 4497            Phenotypic features to construct "subject" section.
 4498            Example:
 4499                "phenotypicFeatures":
 4500                    [
 4501                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4502                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4503                    ]
 4504        - "hpo" (list)
 4505            List of HPO ids as phenotypic features.
 4506            Example:
 4507                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4508            Default: []
 4509        - "outputOptions" (dict):
 4510            Output options (see Exomiser docs).
 4511            Default:
 4512                "output_options" =
 4513                    {
 4514                        "outputContributingVariantsOnly": False,
 4515                        "numGenes": 0,
 4516                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4517                    }
 4518        - "transcript_source" (string):
 4519            Transcript source (either "refseq", "ucsc", "ensembl")
 4520            Default: "refseq"
 4521        - "exomiser_to_info" (boolean):
 4522            Add exomiser TSV file columns as INFO fields in VCF.
 4523            Default: False
 4524        - "release" (string):
            Exomiser database release.
 4526            If not exists, database release will be downloaded (take a while).
 4527            Default: None (provided by application.properties configuration file)
 4528        - "exomiser_application_properties" (file):
 4529            Exomiser configuration file (see Exomiser docs).
 4530            Useful to automatically download databases (especially for specific genome databases).
 4531
 4532        Notes:
 4533        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4535
 4536        :param threads: The number of threads to use
 4537        :return: None.
 4538        """
 4539
 4540        # DEBUG
 4541        log.debug("Start annotation with Exomiser databases")
 4542
 4543        # Threads
 4544        if not threads:
 4545            threads = self.get_threads()
 4546        log.debug("Threads: " + str(threads))
 4547
 4548        # Config
 4549        config = self.get_config()
 4550        log.debug("Config: " + str(config))
 4551
 4552        # Config - Folders - Databases
 4553        databases_folders = (
 4554            config.get("folders", {})
 4555            .get("databases", {})
 4556            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4557        )
 4558        databases_folders = full_path(databases_folders)
 4559        if not os.path.exists(databases_folders):
 4560            log.error(f"Databases annotations: {databases_folders} NOT found")
 4561        log.debug("Databases annotations: " + str(databases_folders))
 4562
 4563        # Config - Exomiser
 4564        exomiser_bin_command = get_bin_command(
 4565            bin="exomiser-cli*.jar",
 4566            tool="exomiser",
 4567            bin_type="jar",
 4568            config=config,
 4569            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4570        )
 4571        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4572        if not exomiser_bin_command:
 4573            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4574            log.error(msg_err)
 4575            raise ValueError(msg_err)
 4576
 4577        # Param
 4578        param = self.get_param()
 4579        log.debug("Param: " + str(param))
 4580
 4581        # Param - Exomiser
 4582        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4583        log.debug(f"Param Exomiser: {param_exomiser}")
 4584
 4585        # Param - Assembly
 4586        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4587        log.debug("Assembly: " + str(assembly))
 4588
 4589        # Data
 4590        table_variants = self.get_table_variants()
 4591
 4592        # Check if not empty
 4593        log.debug("Check if not empty")
 4594        sql_query_chromosomes = (
 4595            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4596        )
 4597        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4598            log.info(f"VCF empty")
 4599            return False
 4600
 4601        # VCF header
 4602        vcf_reader = self.get_header()
 4603        log.debug("Initial header: " + str(vcf_reader.infos))
 4604
 4605        # Samples
 4606        samples = self.get_header_sample_list()
 4607        if not samples:
 4608            log.error("No Samples in VCF")
 4609            return False
 4610        log.debug(f"Samples: {samples}")
 4611
 4612        # Memory limit
 4613        memory_limit = self.get_memory("8G")
 4614        log.debug(f"memory_limit: {memory_limit}")
 4615
 4616        # Exomiser java options
 4617        exomiser_java_options = (
 4618            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4619        )
 4620        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4621
 4622        # Download Exomiser (if not exists)
 4623        exomiser_release = param_exomiser.get("release", None)
 4624        exomiser_application_properties = param_exomiser.get(
 4625            "exomiser_application_properties", None
 4626        )
 4627        databases_download_exomiser(
 4628            assemblies=[assembly],
 4629            exomiser_folder=databases_folders,
 4630            exomiser_release=exomiser_release,
 4631            exomiser_phenotype_release=exomiser_release,
 4632            exomiser_application_properties=exomiser_application_properties,
 4633        )
 4634
 4635        # Force annotation
 4636        force_update_annotation = True
 4637
 4638        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4639            log.debug("Start annotation Exomiser")
 4640
 4641            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4642
 4643                # tmp_dir = "/tmp/exomiser"
 4644
 4645                ### ANALYSIS ###
 4646                ################
 4647
 4648                # Create analysis.json through analysis dict
 4649                # either analysis in param or by default
 4650                # depending on preset exome/genome)
 4651
 4652                # Init analysis dict
 4653                param_exomiser_analysis_dict = {}
 4654
 4655                # analysis from param
 4656                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4657                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4658
                # If analysis in param -> load analysis json
 4660                if param_exomiser_analysis:
 4661
 4662                    # If param analysis is a file and exists
 4663                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4664                        param_exomiser_analysis
 4665                    ):
 4666                        # Load analysis file into analysis dict (either yaml or json)
 4667                        with open(param_exomiser_analysis) as json_file:
 4668                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4669
 4670                    # If param analysis is a dict
 4671                    elif isinstance(param_exomiser_analysis, dict):
 4672                        # Load analysis dict into analysis dict (either yaml or json)
 4673                        param_exomiser_analysis_dict = param_exomiser_analysis
 4674
 4675                    # Error analysis type
 4676                    else:
 4677                        log.error(f"Analysis type unknown. Check param file.")
 4678                        raise ValueError(f"Analysis type unknown. Check param file.")
 4679
 4680                # Case no input analysis config file/dict
 4681                # Use preset (exome/genome) to open default config file
 4682                if not param_exomiser_analysis_dict:
 4683
 4684                    # default preset
 4685                    default_preset = "exome"
 4686
 4687                    # Get param preset or default preset
 4688                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4689
 4690                    # Try to find if preset is a file
 4691                    if os.path.exists(param_exomiser_preset):
 4692                        # Preset file is provided in full path
 4693                        param_exomiser_analysis_default_config_file = (
 4694                            param_exomiser_preset
 4695                        )
 4696                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4697                    #     # Preset file is provided in full path
 4698                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4699                    elif os.path.exists(
 4700                        os.path.join(folder_config, param_exomiser_preset)
 4701                    ):
 4702                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4703                        param_exomiser_analysis_default_config_file = os.path.join(
 4704                            folder_config, param_exomiser_preset
 4705                        )
 4706                    else:
 4707                        # Construct preset file
 4708                        param_exomiser_analysis_default_config_file = os.path.join(
 4709                            folder_config,
 4710                            f"preset-{param_exomiser_preset}-analysis.json",
 4711                        )
 4712
 4713                    # If preset file exists
 4714                    param_exomiser_analysis_default_config_file = full_path(
 4715                        param_exomiser_analysis_default_config_file
 4716                    )
 4717                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
 4719                        with open(
 4720                            param_exomiser_analysis_default_config_file
 4721                        ) as json_file:
 4722                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4723                                json_file
 4724                            )
 4725
 4726                    # Error preset file
 4727                    else:
 4728                        log.error(
 4729                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4730                        )
 4731                        raise ValueError(
 4732                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4733                        )
 4734
 4735                # If no analysis dict created
 4736                if not param_exomiser_analysis_dict:
 4737                    log.error(f"No analysis config")
 4738                    raise ValueError(f"No analysis config")
 4739
 4740                # Log
 4741                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4742
 4743                ### PHENOPACKET ###
 4744                ###################
 4745
 4746                # If no PhenoPacket in analysis dict -> check in param
 4747                if "phenopacket" not in param_exomiser_analysis_dict:
 4748
                    # If PhenoPacket in param -> load phenopacket json
 4750                    if param_exomiser.get("phenopacket", None):
 4751
 4752                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4753                        param_exomiser_phenopacket = full_path(
 4754                            param_exomiser_phenopacket
 4755                        )
 4756
 4757                        # If param phenopacket is a file and exists
 4758                        if isinstance(
 4759                            param_exomiser_phenopacket, str
 4760                        ) and os.path.exists(param_exomiser_phenopacket):
 4761                            # Load phenopacket file into analysis dict (either yaml or json)
 4762                            with open(param_exomiser_phenopacket) as json_file:
 4763                                param_exomiser_analysis_dict["phenopacket"] = (
 4764                                    yaml.safe_load(json_file)
 4765                                )
 4766
 4767                        # If param phenopacket is a dict
 4768                        elif isinstance(param_exomiser_phenopacket, dict):
 4769                            # Load phenopacket dict into analysis dict (either yaml or json)
 4770                            param_exomiser_analysis_dict["phenopacket"] = (
 4771                                param_exomiser_phenopacket
 4772                            )
 4773
 4774                        # Error phenopacket type
 4775                        else:
 4776                            log.error(f"Phenopacket type unknown. Check param file.")
 4777                            raise ValueError(
 4778                                f"Phenopacket type unknown. Check param file."
 4779                            )
 4780
 4781                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4782                if "phenopacket" not in param_exomiser_analysis_dict:
 4783
 4784                    # Init PhenoPacket
 4785                    param_exomiser_analysis_dict["phenopacket"] = {
 4786                        "id": "analysis",
 4787                        "proband": {},
 4788                    }
 4789
 4790                    ### Add subject ###
 4791
 4792                    # If subject exists
 4793                    param_exomiser_subject = param_exomiser.get("subject", {})
 4794
 4795                    # If subject not exists -> found sample ID
 4796                    if not param_exomiser_subject:
 4797
 4798                        # Found sample ID in param
 4799                        sample = param_exomiser.get("sample", None)
 4800
 4801                        # Find sample ID (first sample)
 4802                        if not sample:
 4803                            sample_list = self.get_header_sample_list()
 4804                            if len(sample_list) > 0:
 4805                                sample = sample_list[0]
 4806                            else:
 4807                                log.error(f"No sample found")
 4808                                raise ValueError(f"No sample found")
 4809
 4810                        # Create subject
 4811                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4812
 4813                    # Add to dict
 4814                    param_exomiser_analysis_dict["phenopacket"][
 4815                        "subject"
 4816                    ] = param_exomiser_subject
 4817
 4818                    ### Add "phenotypicFeatures" ###
 4819
 4820                    # If phenotypicFeatures exists
 4821                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4822                        "phenotypicFeatures", []
 4823                    )
 4824
 4825                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4826                    if not param_exomiser_phenotypicfeatures:
 4827
 4828                        # Found HPO in param
 4829                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4830
 4831                        # Split HPO if list in string format separated by comma
 4832                        if isinstance(param_exomiser_hpo, str):
 4833                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4834
 4835                        # Create HPO list
 4836                        for hpo in param_exomiser_hpo:
 4837                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4838                            param_exomiser_phenotypicfeatures.append(
 4839                                {
 4840                                    "type": {
 4841                                        "id": f"HP:{hpo_clean}",
 4842                                        "label": f"HP:{hpo_clean}",
 4843                                    }
 4844                                }
 4845                            )
 4846
 4847                    # Add to dict
 4848                    param_exomiser_analysis_dict["phenopacket"][
 4849                        "phenotypicFeatures"
 4850                    ] = param_exomiser_phenotypicfeatures
 4851
 4852                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4853                    if not param_exomiser_phenotypicfeatures:
 4854                        for step in param_exomiser_analysis_dict.get(
 4855                            "analysis", {}
 4856                        ).get("steps", []):
 4857                            if "hiPhivePrioritiser" in step:
 4858                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4859                                    "steps", []
 4860                                ).remove(step)
 4861
 4862                ### Add Input File ###
 4863
 4864                # Initial file name and htsFiles
 4865                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4866                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4867                    {
 4868                        "uri": tmp_vcf_name,
 4869                        "htsFormat": "VCF",
 4870                        "genomeAssembly": assembly,
 4871                    }
 4872                ]
 4873
 4874                ### Add metaData ###
 4875
 4876                # If metaData not in analysis dict
 4877                if "metaData" not in param_exomiser_analysis_dict:
 4878                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4879                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4880                        "createdBy": "howard",
 4881                        "phenopacketSchemaVersion": 1,
 4882                    }
 4883
 4884                ### OutputOptions ###
 4885
 4886                # Init output result folder
 4887                output_results = os.path.join(tmp_dir, "results")
 4888
 4889                # If no outputOptions in analysis dict
 4890                if "outputOptions" not in param_exomiser_analysis_dict:
 4891
 4892                    # default output formats
 4893                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4894
 4895                    # Get outputOptions in param
 4896                    output_options = param_exomiser.get("outputOptions", None)
 4897
 4898                    # If no output_options in param -> check
 4899                    if not output_options:
 4900                        output_options = {
 4901                            "outputContributingVariantsOnly": False,
 4902                            "numGenes": 0,
 4903                            "outputFormats": defaut_output_formats,
 4904                        }
 4905
 4906                    # Replace outputDirectory in output options
 4907                    output_options["outputDirectory"] = output_results
 4908                    output_options["outputFileName"] = "howard"
 4909
 4910                    # Add outputOptions in analysis dict
 4911                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4912
 4913                else:
 4914
 4915                    # Replace output_results and output format (if exists in param)
 4916                    param_exomiser_analysis_dict["outputOptions"][
 4917                        "outputDirectory"
 4918                    ] = output_results
 4919                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4920                        list(
 4921                            set(
 4922                                param_exomiser_analysis_dict.get(
 4923                                    "outputOptions", {}
 4924                                ).get("outputFormats", [])
 4925                                + ["TSV_VARIANT", "VCF"]
 4926                            )
 4927                        )
 4928                    )
 4929
 4930                # log
 4931                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4932
 4933                ### ANALYSIS FILE ###
 4934                #####################
 4935
 4936                ### Full JSON analysis config file ###
 4937
 4938                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4939                with open(exomiser_analysis, "w") as fp:
 4940                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4941
 4942                ### SPLIT analysis and sample config files
 4943
 4944                # Splitted analysis dict
 4945                param_exomiser_analysis_dict_for_split = (
 4946                    param_exomiser_analysis_dict.copy()
 4947                )
 4948
 4949                # Phenopacket JSON file
 4950                exomiser_analysis_phenopacket = os.path.join(
 4951                    tmp_dir, "analysis_phenopacket.json"
 4952                )
 4953                with open(exomiser_analysis_phenopacket, "w") as fp:
 4954                    json.dump(
 4955                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4956                        fp,
 4957                        indent=4,
 4958                    )
 4959
 4960                # Analysis JSON file without Phenopacket parameters
 4961                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4962                exomiser_analysis_analysis = os.path.join(
 4963                    tmp_dir, "analysis_analysis.json"
 4964                )
 4965                with open(exomiser_analysis_analysis, "w") as fp:
 4966                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4967
 4968                ### INITAL VCF file ###
 4969                #######################
 4970
 4971                ### Create list of samples to use and include inti initial VCF file ####
 4972
 4973                # Subject (main sample)
 4974                # Get sample ID in analysis dict
 4975                sample_subject = (
 4976                    param_exomiser_analysis_dict.get("phenopacket", {})
 4977                    .get("subject", {})
 4978                    .get("id", None)
 4979                )
 4980                sample_proband = (
 4981                    param_exomiser_analysis_dict.get("phenopacket", {})
 4982                    .get("proband", {})
 4983                    .get("subject", {})
 4984                    .get("id", None)
 4985                )
 4986                sample = []
 4987                if sample_subject:
 4988                    sample.append(sample_subject)
 4989                if sample_proband:
 4990                    sample.append(sample_proband)
 4991
 4992                # Get sample ID within Pedigree
 4993                pedigree_persons_list = (
 4994                    param_exomiser_analysis_dict.get("phenopacket", {})
 4995                    .get("pedigree", {})
 4996                    .get("persons", {})
 4997                )
 4998
 4999                # Create list with all sample ID in pedigree (if exists)
 5000                pedigree_persons = []
 5001                for person in pedigree_persons_list:
 5002                    pedigree_persons.append(person.get("individualId"))
 5003
 5004                # Concat subject sample ID and samples ID in pedigreesamples
 5005                samples = list(set(sample + pedigree_persons))
 5006
 5007                # Check if sample list is not empty
 5008                if not samples:
 5009                    log.error(f"No samples found")
 5010                    raise ValueError(f"No samples found")
 5011
 5012                # Create VCF with sample (either sample in param or first one by default)
 5013                # Export VCF file
 5014                self.export_variant_vcf(
 5015                    vcf_file=tmp_vcf_name,
 5016                    remove_info=True,
 5017                    add_samples=True,
 5018                    list_samples=samples,
 5019                    index=False,
 5020                )
 5021
 5022                ### Execute Exomiser ###
 5023                ########################
 5024
 5025                # Init command
 5026                exomiser_command = ""
 5027
 5028                # Command exomiser options
 5029                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5030
 5031                # Release
 5032                exomiser_release = param_exomiser.get("release", None)
 5033                if exomiser_release:
 5034                    # phenotype data version
 5035                    exomiser_options += (
 5036                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5037                    )
 5038                    # data version
 5039                    exomiser_options += (
 5040                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5041                    )
 5042                    # variant white list
 5043                    variant_white_list_file = (
 5044                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5045                    )
 5046                    if os.path.exists(
 5047                        os.path.join(
 5048                            databases_folders, assembly, variant_white_list_file
 5049                        )
 5050                    ):
 5051                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5052
 5053                # transcript_source
 5054                transcript_source = param_exomiser.get(
 5055                    "transcript_source", None
 5056                )  # ucsc, refseq, ensembl
 5057                if transcript_source:
 5058                    exomiser_options += (
 5059                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5060                    )
 5061
 5062                # If analysis contain proband param
 5063                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5064                    "proband", {}
 5065                ):
 5066                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5067
 5068                # If no proband (usually uniq sample)
 5069                else:
 5070                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5071
 5072                # Log
 5073                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5074
 5075                # Run command
 5076                result = subprocess.call(
 5077                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5078                )
 5079                if result:
 5080                    log.error("Exomiser command failed")
 5081                    raise ValueError("Exomiser command failed")
 5082
 5083                ### RESULTS ###
 5084                ###############
 5085
 5086                ### Annotate with TSV fields ###
 5087
 5088                # Init result tsv file
 5089                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5090
 5091                # Init result tsv file
 5092                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5093
 5094                # Parse TSV file and explode columns in INFO field
 5095                if exomiser_to_info and os.path.exists(output_results_tsv):
 5096
 5097                    # Log
 5098                    log.debug("Exomiser columns to VCF INFO field")
 5099
 5100                    # Retrieve columns and types
 5101                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5102                    output_results_tsv_df = self.get_query_to_df(query)
 5103                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5104
 5105                    # Init concat fields for update
 5106                    sql_query_update_concat_fields = []
 5107
 5108                    # Fields to avoid
 5109                    fields_to_avoid = [
 5110                        "CONTIG",
 5111                        "START",
 5112                        "END",
 5113                        "REF",
 5114                        "ALT",
 5115                        "QUAL",
 5116                        "FILTER",
 5117                        "GENOTYPE",
 5118                    ]
 5119
 5120                    # List all columns to add into header
 5121                    for header_column in output_results_tsv_columns:
 5122
 5123                        # If header column is enable
 5124                        if header_column not in fields_to_avoid:
 5125
 5126                            # Header info type
 5127                            header_info_type = "String"
 5128                            header_column_df = output_results_tsv_df[header_column]
 5129                            header_column_df_dtype = header_column_df.dtype
 5130                            if header_column_df_dtype == object:
 5131                                if (
 5132                                    pd.to_numeric(header_column_df, errors="coerce")
 5133                                    .notnull()
 5134                                    .all()
 5135                                ):
 5136                                    header_info_type = "Float"
 5137                            else:
 5138                                header_info_type = "Integer"
 5139
 5140                            # Header info
 5141                            characters_to_validate = ["-"]
 5142                            pattern = "[" + "".join(characters_to_validate) + "]"
 5143                            header_info_name = re.sub(
 5144                                pattern,
 5145                                "_",
 5146                                f"Exomiser_{header_column}".replace("#", ""),
 5147                            )
 5148                            header_info_number = "."
 5149                            header_info_description = (
 5150                                f"Exomiser {header_column} annotation"
 5151                            )
 5152                            header_info_source = "Exomiser"
 5153                            header_info_version = "unknown"
 5154                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5155                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5156                                header_info_name,
 5157                                header_info_number,
 5158                                header_info_type,
 5159                                header_info_description,
 5160                                header_info_source,
 5161                                header_info_version,
 5162                                header_info_code,
 5163                            )
 5164
 5165                            # Add field to add for update to concat fields
 5166                            sql_query_update_concat_fields.append(
 5167                                f"""
 5168                                CASE
 5169                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5170                                    THEN concat(
 5171                                        '{header_info_name}=',
 5172                                        table_parquet."{header_column}",
 5173                                        ';'
 5174                                        )
 5175
 5176                                    ELSE ''
 5177                                END
 5178                            """
 5179                            )
 5180
 5181                    # Update query
 5182                    sql_query_update = f"""
 5183                        UPDATE {table_variants} as table_variants
 5184                            SET INFO = concat(
 5185                                            CASE
 5186                                                WHEN INFO NOT IN ('', '.')
 5187                                                THEN INFO
 5188                                                ELSE ''
 5189                                            END,
 5190                                            CASE
 5191                                                WHEN table_variants.INFO NOT IN ('','.')
 5192                                                THEN ';'
 5193                                                ELSE ''
 5194                                            END,
 5195                                            (
 5196                                            SELECT 
 5197                                                concat(
 5198                                                    {",".join(sql_query_update_concat_fields)}
 5199                                                )
 5200                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5201                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5202                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5203                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5204                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5205                                            )
 5206                                        )
 5207                            ;
 5208                        """
 5209
 5210                    # Update
 5211                    self.conn.execute(sql_query_update)
 5212
 5213                ### Annotate with VCF INFO field ###
 5214
 5215                # Init result VCF file
 5216                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5217
 5218                # If VCF exists
 5219                if os.path.exists(output_results_vcf):
 5220
 5221                    # Log
 5222                    log.debug("Exomiser result VCF update variants")
 5223
 5224                    # Find Exomiser INFO field annotation in header
 5225                    with gzip.open(output_results_vcf, "rt") as f:
 5226                        header_list = self.read_vcf_header(f)
 5227                    exomiser_vcf_header = vcf.Reader(
 5228                        io.StringIO("\n".join(header_list))
 5229                    )
 5230
 5231                    # Add annotation INFO field to header
 5232                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5233
 5234                    # Update variants with VCF
 5235                    self.update_from_vcf(output_results_vcf)
 5236
 5237        return True
 5238
 5239    def annotation_snpeff(self, threads: int = None) -> None:
 5240        """
 5241        This function annotate with snpEff
 5242
 5243        :param threads: The number of threads to use
 5244        :return: the value of the variable "return_value".
 5245        """
 5246
 5247        # DEBUG
 5248        log.debug("Start annotation with snpeff databases")
 5249
 5250        # Threads
 5251        if not threads:
 5252            threads = self.get_threads()
 5253        log.debug("Threads: " + str(threads))
 5254
 5255        # DEBUG
 5256        delete_tmp = True
 5257        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5258            delete_tmp = False
 5259            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5260
 5261        # Config
 5262        config = self.get_config()
 5263        log.debug("Config: " + str(config))
 5264
 5265        # Config - Folders - Databases
 5266        databases_folders = (
 5267            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5268        )
 5269        log.debug("Databases annotations: " + str(databases_folders))
 5270
 5271        # Config - snpEff bin command
 5272        snpeff_bin_command = get_bin_command(
 5273            bin="snpEff.jar",
 5274            tool="snpeff",
 5275            bin_type="jar",
 5276            config=config,
 5277            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5278        )
 5279        if not snpeff_bin_command:
 5280            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5281            log.error(msg_err)
 5282            raise ValueError(msg_err)
 5283
 5284        # Config - snpEff databases
 5285        snpeff_databases = (
 5286            config.get("folders", {})
 5287            .get("databases", {})
 5288            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5289        )
 5290        snpeff_databases = full_path(snpeff_databases)
 5291        if snpeff_databases is not None and snpeff_databases != "":
 5292            log.debug(f"Create snpEff databases folder")
 5293            if not os.path.exists(snpeff_databases):
 5294                os.makedirs(snpeff_databases)
 5295
 5296        # Param
 5297        param = self.get_param()
 5298        log.debug("Param: " + str(param))
 5299
 5300        # Param
 5301        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5302        log.debug("Options: " + str(options))
 5303
 5304        # Param - Assembly
 5305        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5306
 5307        # Param - Options
 5308        snpeff_options = (
 5309            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5310        )
 5311        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5312        snpeff_csvstats = (
 5313            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5314        )
 5315        if snpeff_stats:
 5316            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5317            snpeff_stats = full_path(snpeff_stats)
 5318            snpeff_options += f" -stats {snpeff_stats}"
 5319        if snpeff_csvstats:
 5320            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5321            snpeff_csvstats = full_path(snpeff_csvstats)
 5322            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5323
 5324        # Data
 5325        table_variants = self.get_table_variants()
 5326
 5327        # Check if not empty
 5328        log.debug("Check if not empty")
 5329        sql_query_chromosomes = (
 5330            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5331        )
 5332        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5333        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5334            log.info(f"VCF empty")
 5335            return
 5336
 5337        # Export in VCF
 5338        log.debug("Create initial file to annotate")
 5339        tmp_vcf = NamedTemporaryFile(
 5340            prefix=self.get_prefix(),
 5341            dir=self.get_tmp_dir(),
 5342            suffix=".vcf.gz",
 5343            delete=True,
 5344        )
 5345        tmp_vcf_name = tmp_vcf.name
 5346
 5347        # VCF header
 5348        vcf_reader = self.get_header()
 5349        log.debug("Initial header: " + str(vcf_reader.infos))
 5350
 5351        # Existing annotations
 5352        for vcf_annotation in self.get_header().infos:
 5353
 5354            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5355            log.debug(
 5356                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5357            )
 5358
 5359        # Memory limit
 5360        # if config.get("memory", None):
 5361        #     memory_limit = config.get("memory", "8G")
 5362        # else:
 5363        #     memory_limit = "8G"
 5364        memory_limit = self.get_memory("8G")
 5365        log.debug(f"memory_limit: {memory_limit}")
 5366
 5367        # snpEff java options
 5368        snpeff_java_options = (
 5369            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5370        )
 5371        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5372
 5373        force_update_annotation = True
 5374
 5375        if "ANN" not in self.get_header().infos or force_update_annotation:
 5376
 5377            # Check snpEff database
 5378            log.debug(f"Check snpEff databases {[assembly]}")
 5379            databases_download_snpeff(
 5380                folder=snpeff_databases, assemblies=[assembly], config=config
 5381            )
 5382
 5383            # Export VCF file
 5384            self.export_variant_vcf(
 5385                vcf_file=tmp_vcf_name,
 5386                remove_info=True,
 5387                add_samples=False,
 5388                index=True,
 5389            )
 5390
 5391            # Tmp file
 5392            err_files = []
 5393            tmp_annotate_vcf = NamedTemporaryFile(
 5394                prefix=self.get_prefix(),
 5395                dir=self.get_tmp_dir(),
 5396                suffix=".vcf",
 5397                delete=False,
 5398            )
 5399            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5400            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5401            err_files.append(tmp_annotate_vcf_name_err)
 5402
 5403            # Command
 5404            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5405            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5406            run_parallel_commands([snpeff_command], 1)
 5407
 5408            # Error messages
 5409            log.info(f"Error/Warning messages:")
 5410            error_message_command_all = []
 5411            error_message_command_warning = []
 5412            error_message_command_err = []
 5413            for err_file in err_files:
 5414                with open(err_file, "r") as f:
 5415                    for line in f:
 5416                        message = line.strip()
 5417                        error_message_command_all.append(message)
 5418                        if line.startswith("[W::"):
 5419                            error_message_command_warning.append(message)
 5420                        if line.startswith("[E::"):
 5421                            error_message_command_err.append(f"{err_file}: " + message)
 5422            # log info
 5423            for message in list(
 5424                set(error_message_command_err + error_message_command_warning)
 5425            ):
 5426                log.info(f"   {message}")
 5427            # debug info
 5428            for message in list(set(error_message_command_all)):
 5429                log.debug(f"   {message}")
 5430            # failed
 5431            if len(error_message_command_err):
 5432                log.error("Annotation failed: Error in commands")
 5433                raise ValueError("Annotation failed: Error in commands")
 5434
 5435            # Find annotation in header
 5436            with open(tmp_annotate_vcf_name, "rt") as f:
 5437                header_list = self.read_vcf_header(f)
 5438            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5439
 5440            for ann in annovar_vcf_header.infos:
 5441                if ann not in self.get_header().infos:
 5442                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5443
 5444            # Update variants
 5445            log.info(f"Annotation - Updating...")
 5446            self.update_from_vcf(tmp_annotate_vcf_name)
 5447
 5448        else:
 5449            if "ANN" in self.get_header().infos:
 5450                log.debug(f"Existing snpEff annotations in VCF")
 5451            if force_update_annotation:
 5452                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5453
 5454    def annotation_annovar(self, threads: int = None) -> None:
 5455        """
 5456        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5457        annotations
 5458
 5459        :param threads: number of threads to use
 5460        :return: the value of the variable "return_value".
 5461        """
 5462
 5463        # DEBUG
 5464        log.debug("Start annotation with Annovar databases")
 5465
 5466        # Threads
 5467        if not threads:
 5468            threads = self.get_threads()
 5469        log.debug("Threads: " + str(threads))
 5470
 5471        # Tmp en Err files
 5472        tmp_files = []
 5473        err_files = []
 5474
 5475        # DEBUG
 5476        delete_tmp = True
 5477        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5478            delete_tmp = False
 5479            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5480
 5481        # Config
 5482        config = self.get_config()
 5483        log.debug("Config: " + str(config))
 5484
 5485        # Config - Folders - Databases
 5486        databases_folders = (
 5487            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5488        )
 5489        log.debug("Databases annotations: " + str(databases_folders))
 5490
 5491        # Config - annovar bin command
 5492        annovar_bin_command = get_bin_command(
 5493            bin="table_annovar.pl",
 5494            tool="annovar",
 5495            bin_type="perl",
 5496            config=config,
 5497            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5498        )
 5499        if not annovar_bin_command:
 5500            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5501            log.error(msg_err)
 5502            raise ValueError(msg_err)
 5503
 5504        # Config - BCFTools bin command
 5505        bcftools_bin_command = get_bin_command(
 5506            bin="bcftools",
 5507            tool="bcftools",
 5508            bin_type="bin",
 5509            config=config,
 5510            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5511        )
 5512        if not bcftools_bin_command:
 5513            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5514            log.error(msg_err)
 5515            raise ValueError(msg_err)
 5516
 5517        # Config - annovar databases
 5518        annovar_databases = (
 5519            config.get("folders", {})
 5520            .get("databases", {})
 5521            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5522        )
 5523        if annovar_databases is not None:
 5524            if isinstance(annovar_databases, list):
 5525                annovar_databases = full_path(annovar_databases[0])
 5526                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5527            annovar_databases = full_path(annovar_databases)
 5528            if not os.path.exists(annovar_databases):
 5529                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5530                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5531        else:
 5532            msg_err = f"Annovar databases configuration failed"
 5533            log.error(msg_err)
 5534            raise ValueError(msg_err)
 5535
 5536        # Param
 5537        param = self.get_param()
 5538        log.debug("Param: " + str(param))
 5539
 5540        # Param - options
 5541        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5542        log.debug("Options: " + str(options))
 5543
 5544        # Param - annotations
 5545        annotations = (
 5546            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5547        )
 5548        log.debug("Annotations: " + str(annotations))
 5549
 5550        # Param - Assembly
 5551        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5552
 5553        # Annovar database assembly
 5554        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5555        if annovar_databases_assembly != "" and not os.path.exists(
 5556            annovar_databases_assembly
 5557        ):
 5558            os.makedirs(annovar_databases_assembly)
 5559
 5560        # Data
 5561        table_variants = self.get_table_variants()
 5562
 5563        # Check if not empty
 5564        log.debug("Check if not empty")
 5565        sql_query_chromosomes = (
 5566            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5567        )
 5568        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5569        if not sql_query_chromosomes_df["count"][0]:
 5570            log.info(f"VCF empty")
 5571            return
 5572
 5573        # VCF header
 5574        vcf_reader = self.get_header()
 5575        log.debug("Initial header: " + str(vcf_reader.infos))
 5576
 5577        # Existing annotations
 5578        for vcf_annotation in self.get_header().infos:
 5579
 5580            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5581            log.debug(
 5582                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5583            )
 5584
 5585        force_update_annotation = True
 5586
 5587        if annotations:
 5588
 5589            commands = []
 5590            tmp_annotates_vcf_name_list = []
 5591
 5592            # Export in VCF
 5593            log.debug("Create initial file to annotate")
 5594            tmp_vcf = NamedTemporaryFile(
 5595                prefix=self.get_prefix(),
 5596                dir=self.get_tmp_dir(),
 5597                suffix=".vcf.gz",
 5598                delete=False,
 5599            )
 5600            tmp_vcf_name = tmp_vcf.name
 5601            tmp_files.append(tmp_vcf_name)
 5602            tmp_files.append(tmp_vcf_name + ".tbi")
 5603
 5604            # Export VCF file
 5605            self.export_variant_vcf(
 5606                vcf_file=tmp_vcf_name,
 5607                remove_info=".",
 5608                add_samples=False,
 5609                index=True,
 5610            )
 5611
 5612            # Create file for field rename
 5613            log.debug("Create file for field rename")
 5614            tmp_rename = NamedTemporaryFile(
 5615                prefix=self.get_prefix(),
 5616                dir=self.get_tmp_dir(),
 5617                suffix=".rename",
 5618                delete=False,
 5619            )
 5620            tmp_rename_name = tmp_rename.name
 5621            tmp_files.append(tmp_rename_name)
 5622
 5623            # Check Annovar database
 5624            log.debug(
 5625                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5626            )
 5627            databases_download_annovar(
 5628                folder=annovar_databases,
 5629                files=list(annotations.keys()),
 5630                assemblies=[assembly],
 5631            )
 5632
 5633            for annotation in annotations:
 5634                annotation_fields = annotations[annotation]
 5635
 5636                if not annotation_fields:
 5637                    annotation_fields = {"INFO": None}
 5638
 5639                log.info(f"Annotations Annovar - database '{annotation}'")
 5640                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5641
 5642                # Tmp file for annovar
 5643                err_files = []
 5644                tmp_annotate_vcf_directory = TemporaryDirectory(
 5645                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5646                )
 5647                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5648                tmp_annotate_vcf_name_annovar = (
 5649                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5650                )
 5651                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5652                err_files.append(tmp_annotate_vcf_name_err)
 5653                tmp_files.append(tmp_annotate_vcf_name_err)
 5654
 5655                # Tmp file final vcf annotated by annovar
 5656                tmp_annotate_vcf = NamedTemporaryFile(
 5657                    prefix=self.get_prefix(),
 5658                    dir=self.get_tmp_dir(),
 5659                    suffix=".vcf.gz",
 5660                    delete=False,
 5661                )
 5662                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5663                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5664                tmp_files.append(tmp_annotate_vcf_name)
 5665                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5666
 5667                # Number of fields
 5668                annotation_list = []
 5669                annotation_renamed_list = []
 5670
 5671                for annotation_field in annotation_fields:
 5672
 5673                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5674                    annotation_fields_new_name = annotation_fields.get(
 5675                        annotation_field, annotation_field
 5676                    )
 5677                    if not annotation_fields_new_name:
 5678                        annotation_fields_new_name = annotation_field
 5679
 5680                    if (
 5681                        force_update_annotation
 5682                        or annotation_fields_new_name not in self.get_header().infos
 5683                    ):
 5684                        annotation_list.append(annotation_field)
 5685                        annotation_renamed_list.append(annotation_fields_new_name)
 5686                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5687                        log.warning(
 5688                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5689                        )
 5690
 5691                    # Add rename info
 5692                    run_parallel_commands(
 5693                        [
 5694                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5695                        ],
 5696                        1,
 5697                    )
 5698
 5699                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5700                log.debug("annotation_list: " + str(annotation_list))
 5701
 5702                # protocol
 5703                protocol = annotation
 5704
 5705                # argument
 5706                argument = ""
 5707
 5708                # operation
 5709                operation = "f"
 5710                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5711                    "ensGene"
 5712                ):
 5713                    operation = "g"
 5714                    if options.get("genebase", None):
 5715                        argument = f"""'{options.get("genebase","")}'"""
 5716                elif annotation in ["cytoBand"]:
 5717                    operation = "r"
 5718
 5719                # argument option
 5720                argument_option = ""
 5721                if argument != "":
 5722                    argument_option = " --argument " + argument
 5723
 5724                # command options
 5725                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5726                for option in options:
 5727                    if option not in ["genebase"]:
 5728                        command_options += f""" --{option}={options[option]}"""
 5729
 5730                # Command
 5731
 5732                # Command - Annovar
 5733                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5734                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5735
 5736                # Command - start pipe
 5737                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5738
 5739                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5740                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5741
 5742                # Command - Special characters (refGene annotation)
 5743                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5744
 5745                # Command - Clean empty fields (with value ".")
 5746                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5747
 5748                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5749                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5750                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5751                    # for ann in annotation_renamed_list:
 5752                    for ann in annotation_list:
 5753                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5754
 5755                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5756
 5757                # Command - indexing
 5758                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5759
 5760                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5761                run_parallel_commands([command_annovar], 1)
 5762
 5763                # Error messages
 5764                log.info(f"Error/Warning messages:")
 5765                error_message_command_all = []
 5766                error_message_command_warning = []
 5767                error_message_command_err = []
 5768                for err_file in err_files:
 5769                    with open(err_file, "r") as f:
 5770                        for line in f:
 5771                            message = line.strip()
 5772                            error_message_command_all.append(message)
 5773                            if line.startswith("[W::") or line.startswith("WARNING"):
 5774                                error_message_command_warning.append(message)
 5775                            if line.startswith("[E::") or line.startswith("ERROR"):
 5776                                error_message_command_err.append(
 5777                                    f"{err_file}: " + message
 5778                                )
 5779                # log info
 5780                for message in list(
 5781                    set(error_message_command_err + error_message_command_warning)
 5782                ):
 5783                    log.info(f"   {message}")
 5784                # debug info
 5785                for message in list(set(error_message_command_all)):
 5786                    log.debug(f"   {message}")
 5787                # failed
 5788                if len(error_message_command_err):
 5789                    log.error("Annotation failed: Error in commands")
 5790                    raise ValueError("Annotation failed: Error in commands")
 5791
 5792            if tmp_annotates_vcf_name_list:
 5793
 5794                # List of annotated files
 5795                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5796
 5797                # Tmp file
 5798                tmp_annotate_vcf = NamedTemporaryFile(
 5799                    prefix=self.get_prefix(),
 5800                    dir=self.get_tmp_dir(),
 5801                    suffix=".vcf.gz",
 5802                    delete=False,
 5803                )
 5804                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5805                tmp_files.append(tmp_annotate_vcf_name)
 5806                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5807                err_files.append(tmp_annotate_vcf_name_err)
 5808                tmp_files.append(tmp_annotate_vcf_name_err)
 5809
 5810                # Command merge
 5811                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5812                log.info(
 5813                    f"Annotation Annovar - Annotation merging "
 5814                    + str(len(tmp_annotates_vcf_name_list))
 5815                    + " annotated files"
 5816                )
 5817                log.debug(f"Annotation - merge command: {merge_command}")
 5818                run_parallel_commands([merge_command], 1)
 5819
 5820                # Find annotation in header
 5821                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5822                    header_list = self.read_vcf_header(f)
 5823                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5824
 5825                for ann in annovar_vcf_header.infos:
 5826                    if ann not in self.get_header().infos:
 5827                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5828
 5829                # Update variants
 5830                log.info(f"Annotation Annovar - Updating...")
 5831                self.update_from_vcf(tmp_annotate_vcf_name)
 5832
 5833            # Clean files
 5834            # Tmp file remove command
 5835            if True:
 5836                tmp_files_remove_command = ""
 5837                if tmp_files:
 5838                    tmp_files_remove_command = " ".join(tmp_files)
 5839                clean_command = f" rm -f {tmp_files_remove_command} "
 5840                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5841                log.debug(f"Annotation - cleaning command: {clean_command}")
 5842                run_parallel_commands([clean_command], 1)
 5843
 5844    # Parquet
 5845    def annotation_parquet(self, threads: int = None) -> None:
 5846        """
 5847        It takes a VCF file, and annotates it with a parquet file
 5848
 5849        :param threads: number of threads to use for the annotation
 5850        :return: the value of the variable "result".
 5851        """
 5852
 5853        # DEBUG
 5854        log.debug("Start annotation with parquet databases")
 5855
 5856        # Threads
 5857        if not threads:
 5858            threads = self.get_threads()
 5859        log.debug("Threads: " + str(threads))
 5860
 5861        # DEBUG
 5862        delete_tmp = True
 5863        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5864            delete_tmp = False
 5865            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5866
 5867        # Config
 5868        databases_folders = set(
 5869            self.get_config()
 5870            .get("folders", {})
 5871            .get("databases", {})
 5872            .get("annotations", ["."])
 5873            + self.get_config()
 5874            .get("folders", {})
 5875            .get("databases", {})
 5876            .get("parquet", ["."])
 5877        )
 5878        log.debug("Databases annotations: " + str(databases_folders))
 5879
 5880        # Param
 5881        annotations = (
 5882            self.get_param()
 5883            .get("annotation", {})
 5884            .get("parquet", {})
 5885            .get("annotations", None)
 5886        )
 5887        log.debug("Annotations: " + str(annotations))
 5888
 5889        # Assembly
 5890        assembly = self.get_param().get(
 5891            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5892        )
 5893
 5894        # Force Update Annotation
 5895        force_update_annotation = (
 5896            self.get_param()
 5897            .get("annotation", {})
 5898            .get("options", {})
 5899            .get("annotations_update", False)
 5900        )
 5901        log.debug(f"force_update_annotation={force_update_annotation}")
 5902        force_append_annotation = (
 5903            self.get_param()
 5904            .get("annotation", {})
 5905            .get("options", {})
 5906            .get("annotations_append", False)
 5907        )
 5908        log.debug(f"force_append_annotation={force_append_annotation}")
 5909
 5910        # Data
 5911        table_variants = self.get_table_variants()
 5912
 5913        # Check if not empty
 5914        log.debug("Check if not empty")
 5915        sql_query_chromosomes_df = self.get_query_to_df(
 5916            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5917        )
 5918        if not sql_query_chromosomes_df["count"][0]:
 5919            log.info(f"VCF empty")
 5920            return
 5921
 5922        # VCF header
 5923        vcf_reader = self.get_header()
 5924        log.debug("Initial header: " + str(vcf_reader.infos))
 5925
 5926        # Nb Variants POS
 5927        log.debug("NB Variants Start")
 5928        nb_variants = self.conn.execute(
 5929            f"SELECT count(*) AS count FROM variants"
 5930        ).fetchdf()["count"][0]
 5931        log.debug("NB Variants Stop")
 5932
 5933        # Existing annotations
 5934        for vcf_annotation in self.get_header().infos:
 5935
 5936            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5937            log.debug(
 5938                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5939            )
 5940
 5941        # Added columns
 5942        added_columns = []
 5943
 5944        # drop indexes
 5945        log.debug(f"Drop indexes...")
 5946        self.drop_indexes()
 5947
 5948        if annotations:
 5949
 5950            if "ALL" in annotations:
 5951
 5952                all_param = annotations.get("ALL", {})
 5953                all_param_formats = all_param.get("formats", None)
 5954                all_param_releases = all_param.get("releases", None)
 5955
 5956                databases_infos_dict = self.scan_databases(
 5957                    database_formats=all_param_formats,
 5958                    database_releases=all_param_releases,
 5959                )
 5960                for database_infos in databases_infos_dict.keys():
 5961                    if database_infos not in annotations:
 5962                        annotations[database_infos] = {"INFO": None}
 5963
 5964            for annotation in annotations:
 5965
 5966                if annotation in ["ALL"]:
 5967                    continue
 5968
 5969                # Annotation Name
 5970                annotation_name = os.path.basename(annotation)
 5971
 5972                # Annotation fields
 5973                annotation_fields = annotations[annotation]
 5974                if not annotation_fields:
 5975                    annotation_fields = {"INFO": None}
 5976
 5977                log.debug(f"Annotation '{annotation_name}'")
 5978                log.debug(
 5979                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5980                )
 5981
 5982                # Create Database
 5983                database = Database(
 5984                    database=annotation,
 5985                    databases_folders=databases_folders,
 5986                    assembly=assembly,
 5987                )
 5988
 5989                # Find files
 5990                parquet_file = database.get_database()
 5991                parquet_hdr_file = database.get_header_file()
 5992                parquet_type = database.get_type()
 5993
 5994                # Check if files exists
 5995                if not parquet_file or not parquet_hdr_file:
 5996                    msg_err_list = []
 5997                    if not parquet_file:
 5998                        msg_err_list.append(
 5999                            f"Annotation failed: Annotation file not found"
 6000                        )
 6001                    if parquet_file and not parquet_hdr_file:
 6002                        msg_err_list.append(
 6003                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6004                        )
 6005
 6006                    log.error(". ".join(msg_err_list))
 6007                    raise ValueError(". ".join(msg_err_list))
 6008                else:
 6009                    # Get parquet connexion
 6010                    parquet_sql_attach = database.get_sql_database_attach(
 6011                        output="query"
 6012                    )
 6013                    if parquet_sql_attach:
 6014                        self.conn.execute(parquet_sql_attach)
 6015                    parquet_file_link = database.get_sql_database_link()
 6016                    # Log
 6017                    log.debug(
 6018                        f"Annotation '{annotation_name}' - file: "
 6019                        + str(parquet_file)
 6020                        + " and "
 6021                        + str(parquet_hdr_file)
 6022                    )
 6023
 6024                    # Database full header columns
 6025                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6026                        parquet_hdr_file
 6027                    )
 6028                    # Log
 6029                    log.debug(
 6030                        "Annotation database header columns : "
 6031                        + str(parquet_hdr_vcf_header_columns)
 6032                    )
 6033
 6034                    # Load header as VCF object
 6035                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6036                    # Log
 6037                    log.debug(
 6038                        "Annotation database header: "
 6039                        + str(parquet_hdr_vcf_header_infos)
 6040                    )
 6041
 6042                    # Get extra infos
 6043                    parquet_columns = database.get_extra_columns()
 6044                    # Log
 6045                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6046
 6047                    # Add extra columns if "ALL" in annotation_fields
 6048                    # if "ALL" in annotation_fields:
 6049                    #     allow_add_extra_column = True
 6050                    if "ALL" in annotation_fields and database.get_extra_columns():
 6051                        for extra_column in database.get_extra_columns():
 6052                            if (
 6053                                extra_column not in annotation_fields
 6054                                and extra_column.replace("INFO/", "")
 6055                                not in parquet_hdr_vcf_header_infos
 6056                            ):
 6057                                parquet_hdr_vcf_header_infos[extra_column] = (
 6058                                    vcf.parser._Info(
 6059                                        extra_column,
 6060                                        ".",
 6061                                        "String",
 6062                                        f"{extra_column} description",
 6063                                        "unknown",
 6064                                        "unknown",
 6065                                        self.code_type_map["String"],
 6066                                    )
 6067                                )
 6068
 6069                    # For all fields in database
 6070                    annotation_fields_all = False
 6071                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6072                        annotation_fields_all = True
 6073                        annotation_fields = {
 6074                            key: key for key in parquet_hdr_vcf_header_infos
 6075                        }
 6076
 6077                        log.debug(
 6078                            "Annotation database header - All annotations added: "
 6079                            + str(annotation_fields)
 6080                        )
 6081
 6082                    # Init
 6083
 6084                    # List of annotation fields to use
 6085                    sql_query_annotation_update_info_sets = []
 6086
 6087                    # List of annotation to agregate
 6088                    sql_query_annotation_to_agregate = []
 6089
 6090                    # Number of fields
 6091                    nb_annotation_field = 0
 6092
 6093                    # Annotation fields processed
 6094                    annotation_fields_processed = []
 6095
 6096                    # Columns mapping
 6097                    map_columns = database.map_columns(
 6098                        columns=annotation_fields, prefixes=["INFO/"]
 6099                    )
 6100
 6101                    # Query dict for fields to remove (update option)
 6102                    query_dict_remove = {}
 6103
 6104                    # Fetch Anotation fields
 6105                    for annotation_field in annotation_fields:
 6106
 6107                        # annotation_field_column
 6108                        annotation_field_column = map_columns.get(
 6109                            annotation_field, "INFO"
 6110                        )
 6111
 6112                        # field new name, if parametered
 6113                        annotation_fields_new_name = annotation_fields.get(
 6114                            annotation_field, annotation_field
 6115                        )
 6116                        if not annotation_fields_new_name:
 6117                            annotation_fields_new_name = annotation_field
 6118
 6119                        # To annotate
 6120                        # force_update_annotation = True
 6121                        # force_append_annotation = True
 6122                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6123                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6124                            force_update_annotation
 6125                            or force_append_annotation
 6126                            or (
 6127                                annotation_fields_new_name
 6128                                not in self.get_header().infos
 6129                            )
 6130                        ):
 6131
 6132                            # Add field to annotation to process list
 6133                            annotation_fields_processed.append(
 6134                                annotation_fields_new_name
 6135                            )
 6136
 6137                            # explode infos for the field
 6138                            annotation_fields_new_name_info_msg = ""
 6139                            if (
 6140                                force_update_annotation
 6141                                and annotation_fields_new_name
 6142                                in self.get_header().infos
 6143                            ):
 6144                                # Remove field from INFO
 6145                                query = f"""
 6146                                    UPDATE {table_variants} as table_variants
 6147                                    SET INFO = REGEXP_REPLACE(
 6148                                                concat(table_variants.INFO,''),
 6149                                                ';*{annotation_fields_new_name}=[^;]*',
 6150                                                ''
 6151                                                )
 6152                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6153                                """
 6154                                annotation_fields_new_name_info_msg = " [update]"
 6155                                query_dict_remove[
 6156                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6157                                ] = query
 6158
 6159                            # Sep between fields in INFO
 6160                            nb_annotation_field += 1
 6161                            if nb_annotation_field > 1:
 6162                                annotation_field_sep = ";"
 6163                            else:
 6164                                annotation_field_sep = ""
 6165
 6166                            log.info(
 6167                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6168                            )
 6169
 6170                            # Add INFO field to header
 6171                            parquet_hdr_vcf_header_infos_number = (
 6172                                parquet_hdr_vcf_header_infos[annotation_field].num
 6173                                or "."
 6174                            )
 6175                            parquet_hdr_vcf_header_infos_type = (
 6176                                parquet_hdr_vcf_header_infos[annotation_field].type
 6177                                or "String"
 6178                            )
 6179                            parquet_hdr_vcf_header_infos_description = (
 6180                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6181                                or f"{annotation_field} description"
 6182                            )
 6183                            parquet_hdr_vcf_header_infos_source = (
 6184                                parquet_hdr_vcf_header_infos[annotation_field].source
 6185                                or "unknown"
 6186                            )
 6187                            parquet_hdr_vcf_header_infos_version = (
 6188                                parquet_hdr_vcf_header_infos[annotation_field].version
 6189                                or "unknown"
 6190                            )
 6191
 6192                            vcf_reader.infos[annotation_fields_new_name] = (
 6193                                vcf.parser._Info(
 6194                                    annotation_fields_new_name,
 6195                                    parquet_hdr_vcf_header_infos_number,
 6196                                    parquet_hdr_vcf_header_infos_type,
 6197                                    parquet_hdr_vcf_header_infos_description,
 6198                                    parquet_hdr_vcf_header_infos_source,
 6199                                    parquet_hdr_vcf_header_infos_version,
 6200                                    self.code_type_map[
 6201                                        parquet_hdr_vcf_header_infos_type
 6202                                    ],
 6203                                )
 6204                            )
 6205
 6206                            # Append
 6207                            if force_append_annotation:
 6208                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6209                            else:
 6210                                query_case_when_append = ""
 6211
 6212                            # Annotation/Update query fields
 6213                            # Found in INFO column
 6214                            if (
 6215                                annotation_field_column == "INFO"
 6216                                and "INFO" in parquet_hdr_vcf_header_columns
 6217                            ):
 6218                                sql_query_annotation_update_info_sets.append(
 6219                                    f"""
 6220                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6221                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6222                                        ELSE ''
 6223                                    END
 6224                                """
 6225                                )
 6226                            # Found in a specific column
 6227                            else:
 6228                                sql_query_annotation_update_info_sets.append(
 6229                                    f"""
 6230                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6231                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6232                                        ELSE ''
 6233                                    END
 6234                                """
 6235                                )
 6236                                sql_query_annotation_to_agregate.append(
 6237                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6238                                )
 6239
 6240                        # Not to annotate
 6241                        else:
 6242
 6243                            if force_update_annotation:
 6244                                annotation_message = "forced"
 6245                            else:
 6246                                annotation_message = "skipped"
 6247
 6248                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6249                                log.warning(
 6250                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6251                                )
 6252                            if annotation_fields_new_name in self.get_header().infos:
 6253                                log.warning(
 6254                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6255                                )
 6256
 6257                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6258                    # allow_annotation_full_info = True
 6259                    allow_annotation_full_info = not force_append_annotation
 6260
 6261                    if parquet_type in ["regions"]:
 6262                        allow_annotation_full_info = False
 6263
 6264                    if (
 6265                        allow_annotation_full_info
 6266                        and nb_annotation_field == len(annotation_fields)
 6267                        and annotation_fields_all
 6268                        and (
 6269                            "INFO" in parquet_hdr_vcf_header_columns
 6270                            and "INFO" in database.get_extra_columns()
 6271                        )
 6272                    ):
 6273                        log.debug("Column INFO annotation enabled")
 6274                        sql_query_annotation_update_info_sets = []
 6275                        sql_query_annotation_update_info_sets.append(
 6276                            f" table_parquet.INFO "
 6277                        )
 6278
 6279                    if sql_query_annotation_update_info_sets:
 6280
 6281                        # Annotate
 6282                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6283
 6284                        # Join query annotation update info sets for SQL
 6285                        sql_query_annotation_update_info_sets_sql = ",".join(
 6286                            sql_query_annotation_update_info_sets
 6287                        )
 6288
 6289                        # Check chromosomes list (and variants infos)
 6290                        sql_query_chromosomes = f"""
 6291                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6292                            FROM {table_variants} as table_variants
 6293                            GROUP BY table_variants."#CHROM"
 6294                            ORDER BY table_variants."#CHROM"
 6295                            """
 6296                        sql_query_chromosomes_df = self.conn.execute(
 6297                            sql_query_chromosomes
 6298                        ).df()
 6299                        sql_query_chromosomes_dict = {
 6300                            entry["CHROM"]: {
 6301                                "count": entry["count_variants"],
 6302                                "min": entry["min_variants"],
 6303                                "max": entry["max_variants"],
 6304                            }
 6305                            for index, entry in sql_query_chromosomes_df.iterrows()
 6306                        }
 6307
 6308                        # Init
 6309                        nb_of_query = 0
 6310                        nb_of_variant_annotated = 0
 6311                        query_dict = query_dict_remove
 6312
 6313                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6314                        for chrom in sql_query_chromosomes_dict:
 6315
 6316                            # Number of variant by chromosome
 6317                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6318                                chrom, {}
 6319                            ).get("count", 0)
 6320
 6321                            log.debug(
 6322                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6323                            )
 6324
 6325                            # Annotation with regions database
 6326                            if parquet_type in ["regions"]:
 6327                                sql_query_annotation_from_clause = f"""
 6328                                    FROM (
 6329                                        SELECT 
 6330                                            '{chrom}' AS \"#CHROM\",
 6331                                            table_variants_from.\"POS\" AS \"POS\",
 6332                                            {",".join(sql_query_annotation_to_agregate)}
 6333                                        FROM {table_variants} as table_variants_from
 6334                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6335                                            table_parquet_from."#CHROM" = '{chrom}'
 6336                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6337                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6338                                        )
 6339                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6340                                        GROUP BY table_variants_from.\"POS\"
 6341                                        )
 6342                                        as table_parquet
 6343                                """
 6344
 6345                                sql_query_annotation_where_clause = """
 6346                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6347                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6348                                """
 6349
 6350                            # Annotation with variants database
 6351                            else:
 6352                                sql_query_annotation_from_clause = f"""
 6353                                    FROM {parquet_file_link} as table_parquet
 6354                                """
 6355                                sql_query_annotation_where_clause = f"""
 6356                                    table_variants."#CHROM" = '{chrom}'
 6357                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6358                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6359                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6360                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6361                                """
 6362
 6363                            # Create update query
 6364                            sql_query_annotation_chrom_interval_pos = f"""
 6365                                UPDATE {table_variants} as table_variants
 6366                                    SET INFO = 
 6367                                        concat(
 6368                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6369                                                THEN table_variants.INFO
 6370                                                ELSE ''
 6371                                            END
 6372                                            ,
 6373                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6374                                                        AND (
 6375                                                        concat({sql_query_annotation_update_info_sets_sql})
 6376                                                        )
 6377                                                        NOT IN ('','.') 
 6378                                                    THEN ';'
 6379                                                    ELSE ''
 6380                                            END
 6381                                            ,
 6382                                            {sql_query_annotation_update_info_sets_sql}
 6383                                            )
 6384                                    {sql_query_annotation_from_clause}
 6385                                    WHERE {sql_query_annotation_where_clause}
 6386                                    ;
 6387                                """
 6388
 6389                            # Add update query to dict
 6390                            query_dict[
 6391                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6392                            ] = sql_query_annotation_chrom_interval_pos
 6393
 6394                        nb_of_query = len(query_dict)
 6395                        num_query = 0
 6396
 6397                        # SET max_expression_depth TO x
 6398                        self.conn.execute("SET max_expression_depth TO 10000")
 6399
 6400                        for query_name in query_dict:
 6401                            query = query_dict[query_name]
 6402                            num_query += 1
 6403                            log.info(
 6404                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6405                            )
 6406                            result = self.conn.execute(query)
 6407                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6408                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6409                            log.info(
 6410                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6411                            )
 6412
 6413                        log.info(
 6414                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6415                        )
 6416
 6417                    else:
 6418
 6419                        log.info(
 6420                            f"Annotation '{annotation_name}' - No Annotations available"
 6421                        )
 6422
 6423                    log.debug("Final header: " + str(vcf_reader.infos))
 6424
 6425        # Remove added columns
 6426        for added_column in added_columns:
 6427            self.drop_column(column=added_column)
 6428
 6429    def annotation_splice(self, threads: int = None) -> None:
 6430        """
 6431        This function annotate with snpEff
 6432
 6433        :param threads: The number of threads to use
 6434        :return: the value of the variable "return_value".
 6435        """
 6436
 6437        # DEBUG
 6438        log.debug("Start annotation with splice tools")
 6439
 6440        # Threads
 6441        if not threads:
 6442            threads = self.get_threads()
 6443        log.debug("Threads: " + str(threads))
 6444
 6445        # DEBUG
 6446        delete_tmp = True
 6447        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6448            delete_tmp = False
 6449            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6450
 6451        # Config
 6452        config = self.get_config()
 6453        log.debug("Config: " + str(config))
 6454        splice_config = config.get("tools", {}).get("splice", {})
 6455        if not splice_config:
 6456            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6457            msg_err = "No Splice tool config"
 6458            raise ValueError(msg_err)
 6459        log.debug(f"splice_config: {splice_config}")
 6460
 6461        # Config - Folders - Databases
 6462        databases_folders = (
 6463            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6464        )
 6465        log.debug("Databases annotations: " + str(databases_folders))
 6466
 6467        # Splice docker image
 6468        splice_docker_image = splice_config.get("docker").get("image")
 6469
 6470        # Pull splice image if it's not already there
 6471        if not check_docker_image_exists(splice_docker_image):
 6472            log.warning(
 6473                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6474            )
 6475            try:
 6476                command(f"docker pull {splice_config.get('docker').get('image')}")
 6477            except subprocess.CalledProcessError:
 6478                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6479                log.error(msg_err)
 6480                raise ValueError(msg_err)
 6481
 6482        # Config - splice databases
 6483        splice_databases = (
 6484            config.get("folders", {})
 6485            .get("databases", {})
 6486            .get("splice", DEFAULT_SPLICE_FOLDER)
 6487        )
 6488        splice_databases = full_path(splice_databases)
 6489
 6490        # Param
 6491        param = self.get_param()
 6492        log.debug("Param: " + str(param))
 6493
 6494        # Param
 6495        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6496        log.debug("Options: " + str(options))
 6497
 6498        # Data
 6499        table_variants = self.get_table_variants()
 6500
 6501        # Check if not empty
 6502        log.debug("Check if not empty")
 6503        sql_query_chromosomes = (
 6504            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6505        )
 6506        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6507            log.info("VCF empty")
 6508            return None
 6509
 6510        # Export in VCF
 6511        log.debug("Create initial file to annotate")
 6512
 6513        # Create output folder / work folder
 6514        if options.get("output_folder", ""):
 6515            output_folder = options.get("output_folder", "")
 6516            if not os.path.exists(output_folder):
 6517                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6518        else:
 6519            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6520            if not os.path.exists(output_folder):
 6521                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6522
 6523        if options.get("workdir", ""):
 6524            workdir = options.get("workdir", "")
 6525        else:
 6526            workdir = "/work"
 6527
 6528        # Create tmp VCF file
 6529        tmp_vcf = NamedTemporaryFile(
 6530            prefix=self.get_prefix(),
 6531            dir=output_folder,
 6532            suffix=".vcf",
 6533            delete=False,
 6534        )
 6535        tmp_vcf_name = tmp_vcf.name
 6536
 6537        # VCF header
 6538        header = self.get_header()
 6539
 6540        # Existing annotations
 6541        for vcf_annotation in self.get_header().infos:
 6542
 6543            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6544            log.debug(
 6545                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6546            )
 6547
 6548        # Memory limit
 6549        if config.get("memory", None):
 6550            memory_limit = config.get("memory", "8G").upper()
 6551            # upper()
 6552        else:
 6553            memory_limit = "8G"
 6554        log.debug(f"memory_limit: {memory_limit}")
 6555
 6556        # Check number of variants to annotate
 6557        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6558        where_clause_regex_spip = r"SPiP_\w+"
 6559        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6560        df_list_of_variants_to_annotate = self.get_query_to_df(
 6561            query=f""" SELECT * FROM variants {where_clause} """
 6562        )
 6563        if len(df_list_of_variants_to_annotate) == 0:
 6564            log.warning(
 6565                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6566            )
 6567            return None
 6568        else:
 6569            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6570
 6571        # Export VCF file
 6572        self.export_variant_vcf(
 6573            vcf_file=tmp_vcf_name,
 6574            remove_info=True,
 6575            add_samples=True,
 6576            index=False,
 6577            where_clause=where_clause,
 6578        )
 6579        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6580        if any(value for value in splice_config.values() if value is None):
 6581            log.warning("At least one splice config parameter is empty")
 6582            # exit annotation_splice
 6583            return None
 6584
 6585        # Params in splice nf
 6586        def check_values(dico: dict):
 6587            """
 6588            Ensure parameters for NF splice pipeline
 6589            """
 6590            for key, val in dico.items():
 6591                if key == "genome":
 6592                    if any(
 6593                        assemb in options.get("genome", {})
 6594                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6595                    ):
 6596                        yield f"--{key} hg19"
 6597                    elif any(
 6598                        assemb in options.get("genome", {})
 6599                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6600                    ):
 6601                        yield f"--{key} hg38"
 6602                elif (
 6603                    (isinstance(val, str) and val)
 6604                    or isinstance(val, int)
 6605                    or isinstance(val, bool)
 6606                ):
 6607                    yield f"--{key} {val}"
 6608
 6609        # Genome
 6610        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6611        options["genome"] = genome
 6612        # NF params
 6613        nf_params = []
 6614        # Add options
 6615        if options:
 6616            log.debug(options)
 6617            nf_params = list(check_values(options))
 6618            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6619        else:
 6620            log.debug("No NF params provided")
 6621        # Add threads
 6622        if "threads" not in options.keys():
 6623            nf_params.append(f"--threads {threads}")
 6624        # Genome path
 6625        genome_path = find_genome(
 6626            config.get("folders", {})
 6627            .get("databases", {})
 6628            .get("genomes", DEFAULT_GENOME_FOLDER),
 6629            file=f"{genome}.fa",
 6630        )
 6631        # Add genome path
 6632        if not genome_path:
 6633            raise ValueError(
 6634                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6635            )
 6636        else:
 6637            log.debug(f"Genome: {genome_path}")
 6638            nf_params.append(f"--genome_path {genome_path}")
 6639
 6640        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6641            """
 6642            Setting up updated databases for SPiP and SpliceAI
 6643            """
 6644
 6645            try:
 6646
 6647                # SpliceAI assembly transcriptome
 6648                spliceai_assembly = os.path.join(
 6649                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6650                    options.get("genome"),
 6651                    "transcriptome",
 6652                )
 6653                spip_assembly = options.get("genome")
 6654
 6655                spip = find(
 6656                    f"transcriptome_{spip_assembly}.RData",
 6657                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6658                )
 6659                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6660                log.debug(f"SPiP annotations: {spip}")
 6661                log.debug(f"SpliceAI annotations: {spliceai}")
 6662                if spip and spliceai:
 6663                    return [
 6664                        f"--spip_transcriptome {spip}",
 6665                        f"--spliceai_transcriptome {spliceai}",
 6666                    ]
 6667                else:
 6668                    log.warning(
 6669                        "Can't find splice databases in configuration, use annotations file from image"
 6670                    )
 6671            except TypeError:
 6672                log.warning(
 6673                    "Can't find splice databases in configuration, use annotations file from image"
 6674                )
 6675                return []
 6676
 6677        # Add options, check if transcriptome option have already beend provided
 6678        if (
 6679            "spip_transcriptome" not in nf_params
 6680            and "spliceai_transcriptome" not in nf_params
 6681        ):
 6682            splice_reference = splice_annotations(options, config)
 6683            if splice_reference:
 6684                nf_params.extend(splice_reference)
 6685        # nf_params.append(f"--output_folder {output_folder}")
 6686        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6687        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6688        log.debug(cmd)
 6689        splice_config["docker"]["command"] = cmd
 6690
 6691        # Ensure proxy is set
 6692        proxy = [
 6693            f"-e {var}={os.getenv(var)}"
 6694            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6695            if os.getenv(var) is not None
 6696        ]
 6697        docker_cmd = get_bin_command(
 6698            tool="splice",
 6699            bin_type="docker",
 6700            config=config,
 6701            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6702            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6703        )
 6704        # print(docker_cmd)
 6705        # exit()
 6706        # Docker debug
 6707        # if splice_config.get("rm_container"):
 6708        #     rm_container = "--rm"
 6709        # else:
 6710        #     rm_container = ""
 6711        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6712        log.debug(docker_cmd)
 6713        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6714        log.debug(res.stdout)
 6715        if res.stderr:
 6716            log.error(res.stderr)
 6717        res.check_returncode()
 6718        # Update variants
 6719        log.info("Annotation - Updating...")
 6720        # Test find output vcf
 6721        log.debug(
 6722            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6723        )
 6724        output_vcf = []
 6725        # Wrong folder to look in
 6726        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6727            if (
 6728                files
 6729                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6730            ):
 6731                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6732        # log.debug(os.listdir(options.get("output_folder")))
 6733        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6734        if not output_vcf:
 6735            log.debug(
 6736                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6737            )
 6738        else:
 6739            # Get new header from annotated vcf
 6740            log.debug(f"Initial header: {len(header.infos)} fields")
 6741            # Create new header with splice infos
 6742            new_vcf = Variants(input=output_vcf[0])
 6743            new_vcf_header = new_vcf.get_header().infos
 6744            for keys, infos in new_vcf_header.items():
 6745                if keys not in header.infos.keys():
 6746                    header.infos[keys] = infos
 6747            log.debug(f"New header: {len(header.infos)} fields")
 6748            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6749            self.update_from_vcf(output_vcf[0])
 6750
 6751        # Remove file
 6752        remove_if_exists(output_vcf)
 6753
 6754    ###
 6755    # Prioritization
 6756    ###
 6757
 6758    def get_config_default(self, name: str) -> dict:
 6759        """
 6760        The function `get_config_default` returns a dictionary containing default configurations for
 6761        various calculations and prioritizations.
 6762
 6763        :param name: The `get_config_default` function returns a dictionary containing default
 6764        configurations for different calculations and prioritizations. The `name` parameter is used to
 6765        specify which specific configuration to retrieve from the dictionary
 6766        :type name: str
 6767        :return: The function `get_config_default` returns a dictionary containing default configuration
 6768        settings for different calculations and prioritizations. The specific configuration settings are
 6769        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6770        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6771        returned. If there is no match, an empty dictionary is returned.
 6772        """
 6773
 6774        config_default = {
 6775            "calculations": {
 6776                "variant_chr_pos_alt_ref": {
 6777                    "type": "sql",
 6778                    "name": "variant_chr_pos_alt_ref",
 6779                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6780                    "available": False,
 6781                    "output_column_name": "variant_chr_pos_alt_ref",
 6782                    "output_column_type": "String",
 6783                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6784                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6785                    "operation_info": True,
 6786                },
 6787                "VARTYPE": {
 6788                    "type": "sql",
 6789                    "name": "VARTYPE",
 6790                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6791                    "available": True,
 6792                    "table": "variants",
 6793                    "output_column_name": "VARTYPE",
 6794                    "output_column_type": "String",
 6795                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6796                    "operation_query": """
 6797                            CASE
 6798                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6799                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6800                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6801                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6802                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6803                                ELSE 'UNDEFINED'
 6804                            END
 6805                            """,
 6806                    "info_fields": ["SVTYPE"],
 6807                    "operation_info": True,
 6808                },
 6809                "snpeff_hgvs": {
 6810                    "type": "python",
 6811                    "name": "snpeff_hgvs",
 6812                    "description": "HGVS nomenclatures from snpEff annotation",
 6813                    "available": True,
 6814                    "function_name": "calculation_extract_snpeff_hgvs",
 6815                    "function_params": ["snpeff_hgvs", "ANN"],
 6816                },
 6817                "snpeff_ann_explode": {
 6818                    "type": "python",
 6819                    "name": "snpeff_ann_explode",
 6820                    "description": "Explode snpEff annotations with uniquify values",
 6821                    "available": True,
 6822                    "function_name": "calculation_snpeff_ann_explode",
 6823                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6824                },
 6825                "snpeff_ann_explode_uniquify": {
 6826                    "type": "python",
 6827                    "name": "snpeff_ann_explode_uniquify",
 6828                    "description": "Explode snpEff annotations",
 6829                    "available": True,
 6830                    "function_name": "calculation_snpeff_ann_explode",
 6831                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6832                },
 6833                "snpeff_ann_explode_json": {
 6834                    "type": "python",
 6835                    "name": "snpeff_ann_explode_json",
 6836                    "description": "Explode snpEff annotations in JSON format",
 6837                    "available": True,
 6838                    "function_name": "calculation_snpeff_ann_explode",
 6839                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6840                },
 6841                "NOMEN": {
 6842                    "type": "python",
 6843                    "name": "NOMEN",
 6844                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6845                    "available": True,
 6846                    "function_name": "calculation_extract_nomen",
 6847                    "function_params": [],
 6848                },
 6849                "RENAME_INFO_FIELDS": {
 6850                    "type": "python",
 6851                    "name": "RENAME_INFO_FIELDS",
 6852                    "description": "Rename or remove INFO/tags",
 6853                    "available": True,
 6854                    "function_name": "calculation_rename_info_fields",
 6855                    "function_params": [],
 6856                },
 6857                "FINDBYPIPELINE": {
 6858                    "type": "python",
 6859                    "name": "FINDBYPIPELINE",
 6860                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6861                    "available": True,
 6862                    "function_name": "calculation_find_by_pipeline",
 6863                    "function_params": ["findbypipeline"],
 6864                },
 6865                "FINDBYSAMPLE": {
 6866                    "type": "python",
 6867                    "name": "FINDBYSAMPLE",
 6868                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6869                    "available": True,
 6870                    "function_name": "calculation_find_by_pipeline",
 6871                    "function_params": ["findbysample"],
 6872                },
 6873                "GENOTYPECONCORDANCE": {
 6874                    "type": "python",
 6875                    "name": "GENOTYPECONCORDANCE",
 6876                    "description": "Concordance of genotype for multi caller VCF",
 6877                    "available": True,
 6878                    "function_name": "calculation_genotype_concordance",
 6879                    "function_params": [],
 6880                },
 6881                "BARCODE": {
 6882                    "type": "python",
 6883                    "name": "BARCODE",
 6884                    "description": "BARCODE as VaRank tool",
 6885                    "available": True,
 6886                    "function_name": "calculation_barcode",
 6887                    "function_params": [],
 6888                },
 6889                "BARCODEFAMILY": {
 6890                    "type": "python",
 6891                    "name": "BARCODEFAMILY",
 6892                    "description": "BARCODEFAMILY as VaRank tool",
 6893                    "available": True,
 6894                    "function_name": "calculation_barcode_family",
 6895                    "function_params": ["BCF"],
 6896                },
 6897                "TRIO": {
 6898                    "type": "python",
 6899                    "name": "TRIO",
 6900                    "description": "Inheritance for a trio family",
 6901                    "available": True,
 6902                    "function_name": "calculation_trio",
 6903                    "function_params": [],
 6904                },
 6905                "VAF": {
 6906                    "type": "python",
 6907                    "name": "VAF",
 6908                    "description": "Variant Allele Frequency (VAF) harmonization",
 6909                    "available": True,
 6910                    "function_name": "calculation_vaf_normalization",
 6911                    "function_params": [],
 6912                },
 6913                "VAF_stats": {
 6914                    "type": "python",
 6915                    "name": "VAF_stats",
 6916                    "description": "Variant Allele Frequency (VAF) statistics",
 6917                    "available": True,
 6918                    "function_name": "calculation_genotype_stats",
 6919                    "function_params": ["VAF"],
 6920                },
 6921                "DP_stats": {
 6922                    "type": "python",
 6923                    "name": "DP_stats",
 6924                    "description": "Depth (DP) statistics",
 6925                    "available": True,
 6926                    "function_name": "calculation_genotype_stats",
 6927                    "function_params": ["DP"],
 6928                },
 6929                "variant_id": {
 6930                    "type": "python",
 6931                    "name": "variant_id",
 6932                    "description": "Variant ID generated from variant position and type",
 6933                    "available": True,
 6934                    "function_name": "calculation_variant_id",
 6935                    "function_params": [],
 6936                },
 6937                "transcripts_json": {
 6938                    "type": "python",
 6939                    "name": "transcripts_json",
 6940                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6941                    "available": True,
 6942                    "function_name": "calculation_transcripts_annotation",
 6943                    "function_params": ["transcripts_json", None],
 6944                },
 6945                "transcripts_ann": {
 6946                    "type": "python",
 6947                    "name": "transcripts_ann",
 6948                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6949                    "available": True,
 6950                    "function_name": "calculation_transcripts_annotation",
 6951                    "function_params": [None, "transcripts_ann"],
 6952                },
 6953                "transcripts_annotations": {
 6954                    "type": "python",
 6955                    "name": "transcripts_annotations",
 6956                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6957                    "available": True,
 6958                    "function_name": "calculation_transcripts_annotation",
 6959                    "function_params": [None, None],
 6960                },
 6961                "transcripts_prioritization": {
 6962                    "type": "python",
 6963                    "name": "transcripts_prioritization",
 6964                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6965                    "available": True,
 6966                    "function_name": "calculation_transcripts_prioritization",
 6967                    "function_params": [],
 6968                },
 6969                "transcripts_export": {
 6970                    "type": "python",
 6971                    "name": "transcripts_export",
 6972                    "description": "Export transcripts table/view as a file (using param.json)",
 6973                    "available": True,
 6974                    "function_name": "calculation_transcripts_export",
 6975                    "function_params": [],
 6976                },
 6977            },
 6978            "prioritizations": {
 6979                "default": {
 6980                    "ANN2": [
 6981                        {
 6982                            "type": "contains",
 6983                            "value": "HIGH",
 6984                            "score": 5,
 6985                            "flag": "PASS",
 6986                            "comment": [
 6987                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6988                            ],
 6989                        },
 6990                        {
 6991                            "type": "contains",
 6992                            "value": "MODERATE",
 6993                            "score": 3,
 6994                            "flag": "PASS",
 6995                            "comment": [
 6996                                "A non-disruptive variant that might change protein effectiveness"
 6997                            ],
 6998                        },
 6999                        {
 7000                            "type": "contains",
 7001                            "value": "LOW",
 7002                            "score": 0,
 7003                            "flag": "FILTERED",
 7004                            "comment": [
 7005                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7006                            ],
 7007                        },
 7008                        {
 7009                            "type": "contains",
 7010                            "value": "MODIFIER",
 7011                            "score": 0,
 7012                            "flag": "FILTERED",
 7013                            "comment": [
 7014                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7015                            ],
 7016                        },
 7017                    ],
 7018                }
 7019            },
 7020        }
 7021
 7022        return config_default.get(name, None)
 7023
 7024    def get_config_json(
 7025        self, name: str, config_dict: dict = {}, config_file: str = None
 7026    ) -> dict:
 7027        """
 7028        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7029        default values, a dictionary, and a file.
 7030
 7031        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7032        the name of the configuration. It is used to identify and retrieve the configuration settings
 7033        for a specific component or module
 7034        :type name: str
 7035        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7036        dictionary that allows you to provide additional configuration settings or overrides. When you
 7037        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7038        the key is the configuration setting you want to override or
 7039        :type config_dict: dict
 7040        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7041        specify the path to a configuration file that contains additional settings. If provided, the
 7042        function will read the contents of this file and update the configuration dictionary with the
 7043        values found in the file, overriding any existing values with the
 7044        :type config_file: str
 7045        :return: The function `get_config_json` returns a dictionary containing the configuration
 7046        settings.
 7047        """
 7048
 7049        # Create with default prioritizations
 7050        config_default = self.get_config_default(name=name)
 7051        configuration = config_default
 7052        # log.debug(f"configuration={configuration}")
 7053
 7054        # Replace prioritizations from dict
 7055        for config in config_dict:
 7056            configuration[config] = config_dict[config]
 7057
 7058        # Replace prioritizations from file
 7059        config_file = full_path(config_file)
 7060        if config_file:
 7061            if os.path.exists(config_file):
 7062                with open(config_file) as config_file_content:
 7063                    config_file_dict = yaml.safe_load(config_file_content)
 7064                for config in config_file_dict:
 7065                    configuration[config] = config_file_dict[config]
 7066            else:
 7067                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7068                log.error(msg_error)
 7069                raise ValueError(msg_error)
 7070
 7071        return configuration
 7072
 7073    def prioritization(
 7074        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7075    ) -> bool:
 7076        """
 7077        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7078        prioritizes variants based on configured profiles and criteria.
 7079
 7080        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7081        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7082        a table name is provided, the method will prioritize the variants in that specific table
 7083        :type table: str
 7084        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7085        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7086        provided, the code will use a default prefix value of "PZ"
 7087        :type pz_prefix: str
 7088        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7089        additional parameters specific to the prioritization process. These parameters can include
 7090        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7091        configurations needed for the prioritization of variants in a V
 7092        :type pz_param: dict
 7093        :return: A boolean value (True) is being returned from the `prioritization` function.
 7094        """
 7095
 7096        # Config
 7097        config = self.get_config()
 7098
 7099        # Param
 7100        param = self.get_param()
 7101
 7102        # Prioritization param
 7103        if pz_param is not None:
 7104            prioritization_param = pz_param
 7105        else:
 7106            prioritization_param = param.get("prioritization", {})
 7107
 7108        # Configuration profiles
 7109        prioritization_config_file = prioritization_param.get(
 7110            "prioritization_config", None
 7111        )
 7112        prioritization_config_file = full_path(prioritization_config_file)
 7113        prioritizations_config = self.get_config_json(
 7114            name="prioritizations", config_file=prioritization_config_file
 7115        )
 7116
 7117        # Prioritization prefix
 7118        pz_prefix_default = "PZ"
 7119        if pz_prefix is None:
 7120            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7121
 7122        # Prioritization options
 7123        profiles = prioritization_param.get("profiles", [])
 7124        if isinstance(profiles, str):
 7125            profiles = profiles.split(",")
 7126        pzfields = prioritization_param.get(
 7127            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7128        )
 7129        if isinstance(pzfields, str):
 7130            pzfields = pzfields.split(",")
 7131        default_profile = prioritization_param.get("default_profile", None)
 7132        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7133        prioritization_score_mode = prioritization_param.get(
 7134            "prioritization_score_mode", "HOWARD"
 7135        )
 7136
 7137        # Quick Prioritizations
 7138        prioritizations = param.get("prioritizations", None)
 7139        if prioritizations:
 7140            log.info("Quick Prioritization:")
 7141            for profile in prioritizations.split(","):
 7142                if profile not in profiles:
 7143                    profiles.append(profile)
 7144                    log.info(f"   {profile}")
 7145
 7146        # If profile "ALL" provided, all profiles in the config profiles
 7147        if "ALL" in profiles:
 7148            profiles = list(prioritizations_config.keys())
 7149
 7150        for profile in profiles:
 7151            if prioritizations_config.get(profile, None):
 7152                log.debug(f"Profile '{profile}' configured")
 7153            else:
 7154                msg_error = f"Profile '{profile}' NOT configured"
 7155                log.error(msg_error)
 7156                raise ValueError(msg_error)
 7157
 7158        if profiles:
 7159            log.info(f"Prioritization... ")
 7160        else:
 7161            log.debug(f"No profile defined")
 7162            return False
 7163
 7164        if not default_profile and len(profiles):
 7165            default_profile = profiles[0]
 7166
 7167        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7168        log.debug("Profiles to check: " + str(list(profiles)))
 7169
 7170        # Variables
 7171        if table is not None:
 7172            table_variants = table
 7173        else:
 7174            table_variants = self.get_table_variants(clause="update")
 7175        log.debug(f"Table to prioritize: {table_variants}")
 7176
 7177        # Added columns
 7178        added_columns = []
 7179
 7180        # Create list of PZfields
 7181        # List of PZFields
 7182        list_of_pzfields_original = pzfields + [
 7183            pzfield + pzfields_sep + profile
 7184            for pzfield in pzfields
 7185            for profile in profiles
 7186        ]
 7187        list_of_pzfields = []
 7188        log.debug(f"{list_of_pzfields_original}")
 7189
 7190        # Remove existing PZfields to use if exists
 7191        for pzfield in list_of_pzfields_original:
 7192            if self.get_header().infos.get(pzfield, None) is None:
 7193                list_of_pzfields.append(pzfield)
 7194                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7195            else:
 7196                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7197
 7198        if list_of_pzfields:
 7199
 7200            # Explode Infos prefix
 7201            explode_infos_prefix = self.get_explode_infos_prefix()
 7202
 7203            # PZfields tags description
 7204            PZfields_INFOS = {
 7205                f"{pz_prefix}Tags": {
 7206                    "ID": f"{pz_prefix}Tags",
 7207                    "Number": ".",
 7208                    "Type": "String",
 7209                    "Description": "Variant tags based on annotation criteria",
 7210                },
 7211                f"{pz_prefix}Score": {
 7212                    "ID": f"{pz_prefix}Score",
 7213                    "Number": 1,
 7214                    "Type": "Integer",
 7215                    "Description": "Variant score based on annotation criteria",
 7216                },
 7217                f"{pz_prefix}Flag": {
 7218                    "ID": f"{pz_prefix}Flag",
 7219                    "Number": 1,
 7220                    "Type": "String",
 7221                    "Description": "Variant flag based on annotation criteria",
 7222                },
 7223                f"{pz_prefix}Comment": {
 7224                    "ID": f"{pz_prefix}Comment",
 7225                    "Number": ".",
 7226                    "Type": "String",
 7227                    "Description": "Variant comment based on annotation criteria",
 7228                },
 7229                f"{pz_prefix}Infos": {
 7230                    "ID": f"{pz_prefix}Infos",
 7231                    "Number": ".",
 7232                    "Type": "String",
 7233                    "Description": "Variant infos based on annotation criteria",
 7234                },
 7235                f"{pz_prefix}Class": {
 7236                    "ID": f"{pz_prefix}Class",
 7237                    "Number": ".",
 7238                    "Type": "String",
 7239                    "Description": "Variant class based on annotation criteria",
 7240                },
 7241            }
 7242
 7243            # Create INFO fields if not exist
 7244            for field in PZfields_INFOS:
 7245                field_ID = PZfields_INFOS[field]["ID"]
 7246                field_description = PZfields_INFOS[field]["Description"]
 7247                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7248                    field_description = (
 7249                        PZfields_INFOS[field]["Description"]
 7250                        + f", profile {default_profile}"
 7251                    )
 7252                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7253                        field_ID,
 7254                        PZfields_INFOS[field]["Number"],
 7255                        PZfields_INFOS[field]["Type"],
 7256                        field_description,
 7257                        "unknown",
 7258                        "unknown",
 7259                        code_type_map[PZfields_INFOS[field]["Type"]],
 7260                    )
 7261
 7262            # Create INFO fields if not exist for each profile
 7263            for profile in prioritizations_config:
 7264                if profile in profiles or profiles == []:
 7265                    for field in PZfields_INFOS:
 7266                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7267                        field_description = (
 7268                            PZfields_INFOS[field]["Description"]
 7269                            + f", profile {profile}"
 7270                        )
 7271                        if (
 7272                            field_ID not in self.get_header().infos
 7273                            and field in pzfields
 7274                        ):
 7275                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7276                                field_ID,
 7277                                PZfields_INFOS[field]["Number"],
 7278                                PZfields_INFOS[field]["Type"],
 7279                                field_description,
 7280                                "unknown",
 7281                                "unknown",
 7282                                code_type_map[PZfields_INFOS[field]["Type"]],
 7283                            )
 7284
 7285            # Header
 7286            for pzfield in list_of_pzfields:
 7287                if re.match(f"{pz_prefix}Score.*", pzfield):
 7288                    added_column = self.add_column(
 7289                        table_name=table_variants,
 7290                        column_name=pzfield,
 7291                        column_type="INTEGER",
 7292                        default_value="0",
 7293                    )
 7294                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7295                    added_column = self.add_column(
 7296                        table_name=table_variants,
 7297                        column_name=pzfield,
 7298                        column_type="BOOLEAN",
 7299                        default_value="1",
 7300                    )
 7301                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7302                    added_column = self.add_column(
 7303                        table_name=table_variants,
 7304                        column_name=pzfield,
 7305                        column_type="VARCHAR[]",
 7306                        default_value="null",
 7307                    )
 7308                else:
 7309                    added_column = self.add_column(
 7310                        table_name=table_variants,
 7311                        column_name=pzfield,
 7312                        column_type="STRING",
 7313                        default_value="''",
 7314                    )
 7315                added_columns.append(added_column)
 7316
 7317            # Profiles
 7318            if profiles:
 7319
 7320                # foreach profile in configuration file
 7321                for profile in prioritizations_config:
 7322
 7323                    # If profile is asked in param, or ALL are asked (empty profile [])
 7324                    if profile in profiles or profiles == []:
 7325                        log.info(f"Profile '{profile}'")
 7326
 7327                        sql_set_info_option = ""
 7328
 7329                        sql_set_info = []
 7330
 7331                        # PZ fields set
 7332
 7333                        # PZScore
 7334                        if (
 7335                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7336                            in list_of_pzfields
 7337                        ):
 7338                            sql_set_info.append(
 7339                                f"""
 7340                                    concat(
 7341                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7342                                        {pz_prefix}Score{pzfields_sep}{profile}
 7343                                    ) 
 7344                                """
 7345                            )
 7346                            if (
 7347                                profile == default_profile
 7348                                and f"{pz_prefix}Score" in list_of_pzfields
 7349                            ):
 7350                                sql_set_info.append(
 7351                                    f"""
 7352                                        concat(
 7353                                            '{pz_prefix}Score=',
 7354                                            {pz_prefix}Score{pzfields_sep}{profile}
 7355                                        )
 7356                                    """
 7357                                )
 7358
 7359                        # PZFlag
 7360                        if (
 7361                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7362                            in list_of_pzfields
 7363                        ):
 7364                            sql_set_info.append(
 7365                                f"""
 7366                                    concat(
 7367                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7368                                        CASE 
 7369                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7370                                            THEN 'PASS'
 7371                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7372                                            THEN 'FILTERED'
 7373                                        END
 7374                                    ) 
 7375                                """
 7376                            )
 7377                            if (
 7378                                profile == default_profile
 7379                                and f"{pz_prefix}Flag" in list_of_pzfields
 7380                            ):
 7381                                sql_set_info.append(
 7382                                    f"""
 7383                                        concat(
 7384                                            '{pz_prefix}Flag=',
 7385                                            CASE 
 7386                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7387                                                THEN 'PASS'
 7388                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7389                                                THEN 'FILTERED'
 7390                                            END
 7391                                        )
 7392                                    """
 7393                                )
 7394
 7395                        # PZClass
 7396                        if (
 7397                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7398                            in list_of_pzfields
 7399                        ):
 7400                            sql_set_info.append(
 7401                                f"""
 7402                                    concat(
 7403                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7404                                        CASE
 7405                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7406                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7407                                            ELSE '.'
 7408                                        END 
 7409                                    )
 7410                                    
 7411                                """
 7412                            )
 7413                            if (
 7414                                profile == default_profile
 7415                                and f"{pz_prefix}Class" in list_of_pzfields
 7416                            ):
 7417                                sql_set_info.append(
 7418                                    f"""
 7419                                        concat(
 7420                                            '{pz_prefix}Class=',
 7421                                            CASE
 7422                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7423                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7424                                                ELSE '.'
 7425                                            END 
 7426                                        )
 7427                                    """
 7428                                )
 7429
 7430                        # PZComment
 7431                        if (
 7432                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7433                            in list_of_pzfields
 7434                        ):
 7435                            sql_set_info.append(
 7436                                f"""
 7437                                    CASE
 7438                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7439                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7440                                        ELSE ''
 7441                                    END
 7442                                """
 7443                            )
 7444                            if (
 7445                                profile == default_profile
 7446                                and f"{pz_prefix}Comment" in list_of_pzfields
 7447                            ):
 7448                                sql_set_info.append(
 7449                                    f"""
 7450                                        CASE
 7451                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7452                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7453                                            ELSE ''
 7454                                        END
 7455                                    """
 7456                                )
 7457
 7458                        # PZInfos
 7459                        if (
 7460                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7461                            in list_of_pzfields
 7462                        ):
 7463                            sql_set_info.append(
 7464                                f"""
 7465                                    CASE
 7466                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7467                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7468                                        ELSE ''
 7469                                    END
 7470                                """
 7471                            )
 7472                            if (
 7473                                profile == default_profile
 7474                                and f"{pz_prefix}Infos" in list_of_pzfields
 7475                            ):
 7476                                sql_set_info.append(
 7477                                    f"""
 7478                                        CASE
 7479                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7480                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7481                                            ELSE ''
 7482                                        END
 7483                                    """
 7484                                )
 7485
 7486                        # Merge PZfields
 7487                        sql_set_info_option = ""
 7488                        sql_set_sep = ""
 7489                        for sql_set in sql_set_info:
 7490                            if sql_set_sep:
 7491                                sql_set_info_option += f"""
 7492                                    , concat('{sql_set_sep}', {sql_set})
 7493                                """
 7494                            else:
 7495                                sql_set_info_option += f"""
 7496                                    , {sql_set}
 7497                                """
 7498                            sql_set_sep = ";"
 7499
 7500                        sql_queries = []
 7501                        for annotation in prioritizations_config[profile]:
 7502
 7503                            # skip special sections
 7504                            if annotation.startswith("_"):
 7505                                continue
 7506
 7507                            # For each criterions
 7508                            for criterion in prioritizations_config[profile][
 7509                                annotation
 7510                            ]:
 7511
 7512                                # Criterion mode
 7513                                criterion_mode = None
 7514                                if np.any(
 7515                                    np.isin(list(criterion.keys()), ["type", "value"])
 7516                                ):
 7517                                    criterion_mode = "operation"
 7518                                elif np.any(
 7519                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7520                                ):
 7521                                    criterion_mode = "sql"
 7522                                log.debug(f"Criterion Mode: {criterion_mode}")
 7523
 7524                                # Criterion parameters
 7525                                criterion_type = criterion.get("type", None)
 7526                                criterion_value = criterion.get("value", None)
 7527                                criterion_sql = criterion.get("sql", None)
 7528                                criterion_fields = criterion.get("fields", None)
 7529                                criterion_score = criterion.get("score", 0)
 7530                                criterion_flag = criterion.get("flag", "PASS")
 7531                                criterion_class = criterion.get("class", None)
 7532                                criterion_flag_bool = criterion_flag == "PASS"
 7533                                criterion_comment = (
 7534                                    ", ".join(criterion.get("comment", []))
 7535                                    .replace("'", "''")
 7536                                    .replace(";", ",")
 7537                                    .replace("\t", " ")
 7538                                )
 7539                                criterion_infos = (
 7540                                    str(criterion)
 7541                                    .replace("'", "''")
 7542                                    .replace(";", ",")
 7543                                    .replace("\t", " ")
 7544                                )
 7545
 7546                                # SQL
 7547                                if criterion_sql is not None and isinstance(
 7548                                    criterion_sql, list
 7549                                ):
 7550                                    criterion_sql = " ".join(criterion_sql)
 7551
 7552                                # Fields and explode
 7553                                if criterion_fields is None:
 7554                                    criterion_fields = [annotation]
 7555                                if not isinstance(criterion_fields, list):
 7556                                    criterion_fields = str(criterion_fields).split(",")
 7557
 7558                                # Class
 7559                                if criterion_class is not None and not isinstance(
 7560                                    criterion_class, list
 7561                                ):
 7562                                    criterion_class = str(criterion_class).split(",")
 7563
 7564                                for annotation_field in criterion_fields:
 7565
 7566                                    # Explode specific annotation
 7567                                    log.debug(
 7568                                        f"Explode annotation '{annotation_field}'"
 7569                                    )
 7570                                    added_columns += self.explode_infos(
 7571                                        prefix=explode_infos_prefix,
 7572                                        fields=[annotation_field],
 7573                                        table=table_variants,
 7574                                    )
 7575                                    extra_infos = self.get_extra_infos(
 7576                                        table=table_variants
 7577                                    )
 7578
 7579                                    # Check if annotation field is present
 7580                                    if (
 7581                                        f"{explode_infos_prefix}{annotation_field}"
 7582                                        not in extra_infos
 7583                                    ):
 7584                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7585                                        log.error(msq_err)
 7586                                        raise ValueError(msq_err)
 7587                                    else:
 7588                                        log.debug(
 7589                                            f"Annotation '{annotation_field}' in data"
 7590                                        )
 7591
 7592                                sql_set = []
 7593                                sql_set_info = []
 7594
 7595                                # PZ fields set
 7596
 7597                                # PZScore
 7598                                if (
 7599                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7600                                    in list_of_pzfields
 7601                                ):
 7602                                    # VaRank prioritization score mode
 7603                                    if prioritization_score_mode.upper().strip() in [
 7604                                        "VARANK",
 7605                                        "MAX",
 7606                                        "MAXIMUM",
 7607                                        "TOP",
 7608                                    ]:
 7609                                        sql_set.append(
 7610                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7611                                        )
 7612                                    # default HOWARD prioritization score mode
 7613                                    else:
 7614                                        sql_set.append(
 7615                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7616                                        )
 7617
 7618                                # PZFlag
 7619                                if (
 7620                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7621                                    in list_of_pzfields
 7622                                ):
 7623                                    sql_set.append(
 7624                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7625                                    )
 7626
 7627                                # PZClass
 7628                                if (
 7629                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7630                                    in list_of_pzfields
 7631                                    and criterion_class is not None
 7632                                ):
 7633                                    sql_set.append(
 7634                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7635                                    )
 7636
 7637                                # PZComment
 7638                                if (
 7639                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7640                                    in list_of_pzfields
 7641                                ):
 7642                                    sql_set.append(
 7643                                        f"""
 7644                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7645                                                concat(
 7646                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7647                                                    CASE 
 7648                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7649                                                        THEN ', '
 7650                                                        ELSE ''
 7651                                                    END,
 7652                                                    '{criterion_comment}'
 7653                                                )
 7654                                        """
 7655                                    )
 7656
 7657                                # PZInfos
 7658                                if (
 7659                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7660                                    in list_of_pzfields
 7661                                ):
 7662                                    sql_set.append(
 7663                                        f"""
 7664                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7665                                                concat(
 7666                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7667                                                    '{criterion_infos}'
 7668                                                )
 7669                                        """
 7670                                    )
 7671                                sql_set_option = ",".join(sql_set)
 7672
 7673                                # Criterion and comparison
 7674                                if sql_set_option:
 7675
 7676                                    if criterion_mode in ["operation"]:
 7677
 7678                                        try:
 7679                                            float(criterion_value)
 7680                                            sql_update = f"""
 7681                                                UPDATE {table_variants}
 7682                                                SET {sql_set_option}
 7683                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7684                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7685                                            """
 7686                                        except:
 7687                                            contains_option = ""
 7688                                            if criterion_type == "contains":
 7689                                                contains_option = ".*"
 7690                                            sql_update = f"""
 7691                                                UPDATE {table_variants}
 7692                                                SET {sql_set_option}
 7693                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7694                                            """
 7695                                        sql_queries.append(sql_update)
 7696
 7697                                    elif criterion_mode in ["sql"]:
 7698
 7699                                        sql_update = f"""
 7700                                            UPDATE {table_variants}
 7701                                            SET {sql_set_option}
 7702                                            WHERE {criterion_sql}
 7703                                        """
 7704                                        sql_queries.append(sql_update)
 7705
 7706                                    else:
 7707                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7708                                        log.error(msg_err)
 7709                                        raise ValueError(msg_err)
 7710
 7711                                else:
 7712                                    log.warning(
 7713                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7714                                    )
 7715
 7716                        # PZTags
 7717                        if (
 7718                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7719                            in list_of_pzfields
 7720                        ):
 7721
                            # Build the PZTags value string
 7723                            pztags_value = ""
 7724                            pztags_sep_default = ","
 7725                            pztags_sep = ""
 7726                            for pzfield in pzfields:
 7727                                if pzfield not in [f"{pz_prefix}Tags"]:
 7728                                    if (
 7729                                        f"{pzfield}{pzfields_sep}{profile}"
 7730                                        in list_of_pzfields
 7731                                    ):
 7732                                        if pzfield in [f"{pz_prefix}Flag"]:
 7733                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7734                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7735                                                    THEN 'PASS'
 7736                                                    ELSE 'FILTERED'
 7737                                                END, '"""
 7738                                        elif pzfield in [f"{pz_prefix}Class"]:
 7739                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7740                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7741                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7742                                                    ELSE '.'
 7743                                                END, '"""
 7744                                        else:
 7745                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7746                                        pztags_sep = pztags_sep_default
 7747
                            # Add update query for the per-profile PZTags field
 7749                            sql_update_pztags = f"""
 7750                                UPDATE {table_variants}
 7751                                SET INFO = concat(
 7752                                        INFO,
 7753                                        CASE WHEN INFO NOT in ('','.')
 7754                                                THEN ';'
 7755                                                ELSE ''
 7756                                        END,
 7757                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7758                                    )
 7759                                """
 7760                            sql_queries.append(sql_update_pztags)
 7761
                            # Add update query for PZTags of the default profile
 7763                            if profile == default_profile:
 7764                                sql_update_pztags_default = f"""
 7765                                UPDATE {table_variants}
 7766                                SET INFO = concat(
 7767                                        INFO,
 7768                                        ';',
 7769                                        '{pz_prefix}Tags={pztags_value}'
 7770                                    )
 7771                                """
 7772                                sql_queries.append(sql_update_pztags_default)
 7773
 7774                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7775
 7776                        if sql_queries:
 7777
 7778                            for sql_query in sql_queries:
 7779                                log.debug(
 7780                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7781                                )
 7782                                self.conn.execute(sql_query)
 7783
 7784                        log.info(f"""Profile '{profile}' - Update... """)
 7785                        sql_query_update = f"""
 7786                            UPDATE {table_variants}
 7787                            SET INFO =  
 7788                                concat(
 7789                                    CASE
 7790                                        WHEN INFO NOT IN ('','.')
 7791                                        THEN concat(INFO, ';')
 7792                                        ELSE ''
 7793                                    END
 7794                                    {sql_set_info_option}
 7795                                )
 7796                        """
 7797                        self.conn.execute(sql_query_update)
 7798
 7799        else:
 7800
 7801            log.warning(f"No profiles in parameters")
 7802
 7803        # Remove added columns
 7804        for added_column in added_columns:
 7805            self.drop_column(column=added_column)
 7806
 7807        # Explode INFOS fields into table fields
 7808        if self.get_explode_infos():
 7809            self.explode_infos(
 7810                prefix=self.get_explode_infos_prefix(),
 7811                fields=self.get_explode_infos_fields(),
 7812                force=True,
 7813            )
 7814
 7815        return True
 7816
 7817    ###
 7818    # HGVS
 7819    ###
 7820
 7821    def annotation_hgvs(self, threads: int = None) -> None:
 7822        """
 7823        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7824        coordinates and alleles.
 7825
 7826        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7827        threads to use for parallel processing. If no value is provided, it will default to the number
 7828        of threads obtained from the `get_threads()` method
 7829        :type threads: int
 7830        """
 7831
 7832        # Function for each partition of the Dask Dataframe
 7833        def partition_function(partition):
 7834            """
 7835            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7836            each row of a DataFrame called `partition`.
 7837
 7838            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7839            to be processed
 7840            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7841            the "partition" dataframe along the axis 1.
 7842            """
 7843            return partition.apply(annotation_hgvs_partition, axis=1)
 7844
 7845        def annotation_hgvs_partition(row) -> str:
 7846            """
 7847            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7848            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7849
 7850            :param row: A dictionary-like object that contains the values for the following keys:
 7851            :return: a string that contains the HGVS names associated with the given row of data.
 7852            """
 7853
 7854            chr = row["CHROM"]
 7855            pos = row["POS"]
 7856            ref = row["REF"]
 7857            alt = row["ALT"]
 7858
 7859            # Find list of associated transcripts
 7860            transcripts_list = list(
 7861                polars_conn.execute(
 7862                    f"""
 7863                SELECT transcript
 7864                FROM refseq_df
 7865                WHERE CHROM='{chr}'
 7866                AND POS={pos}
 7867            """
 7868                )["transcript"]
 7869            )
 7870
 7871            # Full HGVS annotation in list
 7872            hgvs_full_list = []
 7873
 7874            for transcript_name in transcripts_list:
 7875
 7876                # Transcript
 7877                transcript = get_transcript(
 7878                    transcripts=transcripts, transcript_name=transcript_name
 7879                )
 7880                # Exon
 7881                if use_exon:
 7882                    exon = transcript.find_exon_number(pos)
 7883                else:
 7884                    exon = None
 7885                # Protein
 7886                transcript_protein = None
 7887                if use_protein or add_protein or full_format:
 7888                    transcripts_protein = list(
 7889                        polars_conn.execute(
 7890                            f"""
 7891                        SELECT protein
 7892                        FROM refseqlink_df
 7893                        WHERE transcript='{transcript_name}'
 7894                        LIMIT 1
 7895                    """
 7896                        )["protein"]
 7897                    )
 7898                    if len(transcripts_protein):
 7899                        transcript_protein = transcripts_protein[0]
 7900
 7901                # HGVS name
 7902                hgvs_name = format_hgvs_name(
 7903                    chr,
 7904                    pos,
 7905                    ref,
 7906                    alt,
 7907                    genome=genome,
 7908                    transcript=transcript,
 7909                    transcript_protein=transcript_protein,
 7910                    exon=exon,
 7911                    use_gene=use_gene,
 7912                    use_protein=use_protein,
 7913                    full_format=full_format,
 7914                    use_version=use_version,
 7915                    codon_type=codon_type,
 7916                )
 7917                hgvs_full_list.append(hgvs_name)
 7918                if add_protein and not use_protein and not full_format:
 7919                    hgvs_name = format_hgvs_name(
 7920                        chr,
 7921                        pos,
 7922                        ref,
 7923                        alt,
 7924                        genome=genome,
 7925                        transcript=transcript,
 7926                        transcript_protein=transcript_protein,
 7927                        exon=exon,
 7928                        use_gene=use_gene,
 7929                        use_protein=True,
 7930                        full_format=False,
 7931                        use_version=use_version,
 7932                        codon_type=codon_type,
 7933                    )
 7934                    hgvs_full_list.append(hgvs_name)
 7935
 7936            # Create liste of HGVS annotations
 7937            hgvs_full = ",".join(hgvs_full_list)
 7938
 7939            return hgvs_full
 7940
 7941        # Polars connexion
 7942        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7943
 7944        # Config
 7945        config = self.get_config()
 7946
 7947        # Databases
 7948        # Genome
 7949        databases_genomes_folders = (
 7950            config.get("folders", {})
 7951            .get("databases", {})
 7952            .get("genomes", DEFAULT_GENOME_FOLDER)
 7953        )
 7954        databases_genome = (
 7955            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7956        )
 7957        # refseq database folder
 7958        databases_refseq_folders = (
 7959            config.get("folders", {})
 7960            .get("databases", {})
 7961            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7962        )
 7963        # refseq
 7964        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7965        # refSeqLink
 7966        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7967
 7968        # Param
 7969        param = self.get_param()
 7970
 7971        # Quick HGVS
 7972        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7973            log.info(f"Quick HGVS Annotation:")
 7974            if not param.get("hgvs", None):
 7975                param["hgvs"] = {}
 7976            for option in param.get("hgvs_options", "").split(","):
 7977                option_var_val = option.split("=")
 7978                option_var = option_var_val[0]
 7979                if len(option_var_val) > 1:
 7980                    option_val = option_var_val[1]
 7981                else:
 7982                    option_val = "True"
 7983                if option_val.upper() in ["TRUE"]:
 7984                    option_val = True
 7985                elif option_val.upper() in ["FALSE"]:
 7986                    option_val = False
 7987                log.info(f"   {option_var}={option_val}")
 7988                param["hgvs"][option_var] = option_val
 7989
 7990        # Check if HGVS annotation enabled
 7991        if "hgvs" in param:
 7992            log.info(f"HGVS Annotation... ")
 7993            for hgvs_option in param.get("hgvs", {}):
 7994                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7995        else:
 7996            return
 7997
 7998        # HGVS Param
 7999        param_hgvs = param.get("hgvs", {})
 8000        use_exon = param_hgvs.get("use_exon", False)
 8001        use_gene = param_hgvs.get("use_gene", False)
 8002        use_protein = param_hgvs.get("use_protein", False)
 8003        add_protein = param_hgvs.get("add_protein", False)
 8004        full_format = param_hgvs.get("full_format", False)
 8005        use_version = param_hgvs.get("use_version", False)
 8006        codon_type = param_hgvs.get("codon_type", "3")
 8007
        # refSeq and refSeqLink
 8009        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 8010        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 8011
 8012        # Assembly
 8013        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 8014
 8015        # Genome
 8016        genome_file = None
 8017        if find_genome(databases_genome):
 8018            genome_file = find_genome(databases_genome)
 8019        else:
 8020            genome_file = find_genome(
 8021                genome_path=databases_genomes_folders, assembly=assembly
 8022            )
 8023        log.debug("Genome: " + str(genome_file))
 8024
        # refSeq
 8026        refseq_file = find_file_prefix(
 8027            input_file=databases_refseq,
 8028            prefix="ncbiRefSeq",
 8029            folder=databases_refseq_folders,
 8030            assembly=assembly,
 8031        )
 8032        log.debug("refSeq: " + str(refseq_file))
 8033
 8034        # refSeqLink
 8035        refseqlink_file = find_file_prefix(
 8036            input_file=databases_refseqlink,
 8037            prefix="ncbiRefSeqLink",
 8038            folder=databases_refseq_folders,
 8039            assembly=assembly,
 8040        )
 8041        log.debug("refSeqLink: " + str(refseqlink_file))
 8042
 8043        # Threads
 8044        if not threads:
 8045            threads = self.get_threads()
 8046        log.debug("Threads: " + str(threads))
 8047
 8048        # Variables
 8049        table_variants = self.get_table_variants(clause="update")
 8050
 8051        # Get variants SNV and InDel only
 8052        query_variants = f"""
 8053            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8054            FROM {table_variants}
 8055            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8056            """
 8057        df_variants = self.get_query_to_df(query_variants)
 8058
 8059        # Added columns
 8060        added_columns = []
 8061
 8062        # Add hgvs column in variants table
 8063        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8064        added_column = self.add_column(
 8065            table_variants, hgvs_column_name, "STRING", default_value=None
 8066        )
 8067        added_columns.append(added_column)
 8068
 8069        log.debug(f"refSeq loading...")
 8070        # refSeq in duckDB
 8071        refseq_table = get_refseq_table(
 8072            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8073        )
 8074        # Loading all refSeq in Dataframe
 8075        refseq_query = f"""
 8076            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8077            FROM {refseq_table}
 8078            JOIN df_variants ON (
 8079                {refseq_table}.chrom = df_variants.CHROM
 8080                AND {refseq_table}.txStart<=df_variants.POS
 8081                AND {refseq_table}.txEnd>=df_variants.POS
 8082            )
 8083        """
 8084        refseq_df = self.conn.query(refseq_query).pl()
 8085
 8086        if refseqlink_file:
 8087            log.debug(f"refSeqLink loading...")
 8088            # refSeqLink in duckDB
 8089            refseqlink_table = get_refseq_table(
 8090                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8091            )
 8092            # Loading all refSeqLink in Dataframe
 8093            protacc_column = "protAcc_with_ver"
 8094            mrnaacc_column = "mrnaAcc_with_ver"
 8095            refseqlink_query = f"""
 8096                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8097                FROM {refseqlink_table} 
 8098                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8099                WHERE protAcc_without_ver IS NOT NULL
 8100            """
 8101            # Polars Dataframe
 8102            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8103
 8104        # Read RefSeq transcripts into a python dict/model.
 8105        log.debug(f"Transcripts loading...")
 8106        with tempfile.TemporaryDirectory() as tmpdir:
 8107            transcripts_query = f"""
 8108                COPY (
 8109                    SELECT {refseq_table}.*
 8110                    FROM {refseq_table}
 8111                    JOIN df_variants ON (
 8112                        {refseq_table}.chrom=df_variants.CHROM
 8113                        AND {refseq_table}.txStart<=df_variants.POS
 8114                        AND {refseq_table}.txEnd>=df_variants.POS
 8115                    )
 8116                )
 8117                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8118            """
 8119            self.conn.query(transcripts_query)
 8120            with open(f"{tmpdir}/transcript.tsv") as infile:
 8121                transcripts = read_transcripts(infile)
 8122
 8123        # Polars connexion
 8124        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8125
 8126        log.debug("Genome loading...")
 8127        # Read genome sequence using pyfaidx.
 8128        genome = Fasta(genome_file)
 8129
 8130        log.debug("Start annotation HGVS...")
 8131
 8132        # Create
 8133        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8134        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8135
 8136        # Use dask.dataframe.apply() to apply function on each partition
 8137        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8138
 8139        # Convert Dask DataFrame to Pandas Dataframe
 8140        df = ddf.compute()
 8141
 8142        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8143        with tempfile.TemporaryDirectory() as tmpdir:
 8144            df_parquet = os.path.join(tmpdir, "df.parquet")
 8145            df.to_parquet(df_parquet)
 8146
 8147            # Update hgvs column
 8148            update_variant_query = f"""
 8149                UPDATE {table_variants}
 8150                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8151                FROM read_parquet('{df_parquet}') as df
 8152                WHERE variants."#CHROM" = df.CHROM
 8153                AND variants.POS = df.POS
 8154                AND variants.REF = df.REF
 8155                AND variants.ALT = df.ALT
 8156                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8157                """
 8158            self.execute_query(update_variant_query)
 8159
 8160        # Update INFO column
 8161        sql_query_update = f"""
 8162            UPDATE {table_variants}
 8163            SET INFO = 
 8164                concat(
 8165                    CASE 
 8166                        WHEN INFO NOT IN ('','.')
 8167                        THEN concat(INFO, ';')
 8168                        ELSE ''
 8169                    END,
 8170                    'hgvs=',
 8171                    {hgvs_column_name}
 8172                )
 8173            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8174            """
 8175        self.execute_query(sql_query_update)
 8176
 8177        # Add header
 8178        HGVS_INFOS = {
 8179            "hgvs": {
 8180                "ID": "hgvs",
 8181                "Number": ".",
 8182                "Type": "String",
 8183                "Description": f"HGVS annotatation with HOWARD",
 8184            }
 8185        }
 8186
 8187        for field in HGVS_INFOS:
 8188            field_ID = HGVS_INFOS[field]["ID"]
 8189            field_description = HGVS_INFOS[field]["Description"]
 8190            self.get_header().infos[field_ID] = vcf.parser._Info(
 8191                field_ID,
 8192                HGVS_INFOS[field]["Number"],
 8193                HGVS_INFOS[field]["Type"],
 8194                field_description,
 8195                "unknown",
 8196                "unknown",
 8197                code_type_map[HGVS_INFOS[field]["Type"]],
 8198            )
 8199
 8200        # Remove added columns
 8201        for added_column in added_columns:
 8202            self.drop_column(column=added_column)
 8203
 8204    ###
 8205    # Calculation
 8206    ###
 8207
 8208    def get_operations_help(
 8209        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8210    ) -> list:
 8211
 8212        # Init
 8213        operations_help = []
 8214
 8215        # operations
 8216        operations = self.get_config_json(
 8217            name="calculations",
 8218            config_dict=operations_config_dict,
 8219            config_file=operations_config_file,
 8220        )
 8221        for op in operations:
 8222            op_name = operations[op].get("name", op).upper()
 8223            op_description = operations[op].get("description", op_name)
 8224            op_available = operations[op].get("available", False)
 8225            if op_available:
 8226                operations_help.append(f"   {op_name}: {op_description}")
 8227
 8228        # Sort operations
 8229        operations_help.sort()
 8230
 8231        # insert header
 8232        operations_help.insert(0, "Available calculation operations:")
 8233
 8234        # Return
 8235        return operations_help
 8236
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run calculation operations on the variants table.

        Each requested operation is looked up (case-insensitively) in the
        operations configuration, then dispatched to either
        `calculation_process_function` (type "python") or
        `calculation_process_sql` (type "sql", the default).

        Operations to run are gathered from:
        - param "calculation.calculations" (overrides the `operations` argument),
        - the quick-calculation param "calculations" (comma-separated operation
          names), which are merged in front of the previous ones while keeping
          their order.

        param json example:
            "calculation": {
                "calculations": {
                    "NOMEN": {
                        "options": {
                            "hgvs_field": "hgvs"
                        }
                    },
                    "middle": null
                }
            }

        :param operations: dict of operations to process; superseded by the
        "calculation.calculations" section of the parameters when present
        :type operations: dict (optional)
        :param operations_config_dict: operations configuration as a dict
        :type operations_config_dict: dict (optional)
        :param operations_config_file: path to an operations configuration
        file; defaults to param "calculation.calculation_config" when not
        provided
        :type operations_config_file: str (optional)
        :raises ValueError: if an operation name is unknown in the operations
        configuration, or its "type" is neither "python" nor "sql"
        """

        # Param
        param = self.get_param()

        # Check operations config file (fall back to the param-level setting)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys — operation lookup below is case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add comma-separated operation names from the
        # top-level "calculations" param
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order): quick
            # calculations come first, reusing any options already configured
            # for them in `operations`
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (after the quick ones)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (last-resort fallback to param)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations: dispatch by configured type
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 8364
 8365    def calculation_process_sql(
 8366        self, operation: dict, operation_name: str = "unknown"
 8367    ) -> None:
 8368        """
 8369        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8370        performs the operation, updating the specified table with the result.
 8371
 8372        :param operation: The `operation` parameter is a dictionary that contains information about the
 8373        mathematical operation to be performed. It includes the following keys:
 8374        :type operation: dict
 8375        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8376        the mathematical operation being performed. It is used for logging and error handling purposes,
 8377        defaults to unknown
 8378        :type operation_name: str (optional)
 8379        """
 8380
 8381        # Operation infos
 8382        operation_name = operation.get("name", "unknown")
 8383        log.debug(f"process SQL {operation_name}")
 8384        output_column_name = operation.get("output_column_name", operation_name)
 8385        output_column_type = operation.get("output_column_type", "String")
 8386        prefix = operation.get("explode_infos_prefix", "")
 8387        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8388        output_column_description = operation.get(
 8389            "output_column_description", f"{operation_name} operation"
 8390        )
 8391        operation_query = operation.get("operation_query", None)
 8392        if isinstance(operation_query, list):
 8393            operation_query = " ".join(operation_query)
 8394        operation_info_fields = operation.get("info_fields", [])
 8395        operation_info_fields_check = operation.get("info_fields_check", False)
 8396        operation_info = operation.get("operation_info", True)
 8397        operation_table = operation.get(
 8398            "table", self.get_table_variants(clause="alter")
 8399        )
 8400
 8401        # table variants
 8402        if operation_table:
 8403            table_variants = operation_table
 8404        else:
 8405            table_variants = self.get_table_variants(clause="alter")
 8406
 8407        if operation_query:
 8408
 8409            # Info fields check
 8410            operation_info_fields_check_result = True
 8411            if operation_info_fields_check:
 8412                header_infos = self.get_header().infos
 8413                for info_field in operation_info_fields:
 8414                    operation_info_fields_check_result = (
 8415                        operation_info_fields_check_result
 8416                        and info_field in header_infos
 8417                    )
 8418
 8419            # If info fields available
 8420            if operation_info_fields_check_result:
 8421
 8422                # Added_columns
 8423                added_columns = []
 8424
 8425                # Create VCF header field
 8426                vcf_reader = self.get_header()
 8427                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8428                    output_column_name,
 8429                    ".",
 8430                    output_column_type,
 8431                    output_column_description,
 8432                    "howard calculation",
 8433                    "0",
 8434                    self.code_type_map.get(output_column_type),
 8435                )
 8436
 8437                # Explode infos if needed
 8438                log.debug(f"calculation_process_sql prefix {prefix}")
 8439                added_columns += self.explode_infos(
 8440                    prefix=prefix,
 8441                    fields=[output_column_name] + operation_info_fields,
 8442                    force=False,
 8443                    table=table_variants,
 8444                )
 8445
 8446                # Create column
 8447                added_column = self.add_column(
 8448                    table_name=table_variants,
 8449                    column_name=prefix + output_column_name,
 8450                    column_type=output_column_type_sql,
 8451                    default_value="null",
 8452                )
 8453                added_columns.append(added_column)
 8454
 8455                # Operation calculation
 8456                try:
 8457
 8458                    # Query to update calculation column
 8459                    sql_update = f"""
 8460                        UPDATE {table_variants}
 8461                        SET "{prefix}{output_column_name}" = ({operation_query})
 8462                    """
 8463                    self.conn.execute(sql_update)
 8464
 8465                    # Add to INFO
 8466                    if operation_info:
 8467                        sql_update_info = f"""
 8468                            UPDATE {table_variants}
 8469                            SET "INFO" =
 8470                                concat(
 8471                                    CASE
 8472                                        WHEN "INFO" IS NOT NULL
 8473                                        THEN concat("INFO", ';')
 8474                                        ELSE ''
 8475                                    END,
 8476                                    '{output_column_name}=',
 8477                                    "{prefix}{output_column_name}"
 8478                                )
 8479                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8480                        """
 8481                        self.conn.execute(sql_update_info)
 8482
 8483                except:
 8484                    log.error(
 8485                        f"Operations config: Calculation '{operation_name}' query failed"
 8486                    )
 8487                    raise ValueError(
 8488                        f"Operations config: Calculation '{operation_name}' query failed"
 8489                    )
 8490
 8491                # Remove added columns
 8492                for added_column in added_columns:
 8493                    log.debug(f"added_column: {added_column}")
 8494                    self.drop_column(column=added_column)
 8495
 8496            else:
 8497                log.error(
 8498                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8499                )
 8500                raise ValueError(
 8501                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8502                )
 8503
 8504        else:
 8505            log.error(
 8506                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8507            )
 8508            raise ValueError(
 8509                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8510            )
 8511
 8512    def calculation_process_function(
 8513        self, operation: dict, operation_name: str = "unknown"
 8514    ) -> None:
 8515        """
 8516        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8517        function with the given parameters.
 8518
 8519        :param operation: The `operation` parameter is a dictionary that contains information about the
 8520        operation to be performed. It has the following keys:
 8521        :type operation: dict
 8522        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8523        the operation being performed. It is used for logging purposes, defaults to unknown
 8524        :type operation_name: str (optional)
 8525        """
 8526
 8527        operation_name = operation["name"]
 8528        log.debug(f"process Python {operation_name}")
 8529        function_name = operation["function_name"]
 8530        function_params = operation["function_params"]
 8531        getattr(self, function_name)(*function_params)
 8532
 8533    def calculation_variant_id(self) -> None:
 8534        """
 8535        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8536        updates the INFO field of a variants table with the variant ID.
 8537        """
 8538
 8539        # variant_id annotation field
 8540        variant_id_tag = self.get_variant_id_column()
 8541        added_columns = [variant_id_tag]
 8542
 8543        # variant_id hgvs tags"
 8544        vcf_infos_tags = {
 8545            variant_id_tag: "howard variant ID annotation",
 8546        }
 8547
 8548        # Variants table
 8549        table_variants = self.get_table_variants()
 8550
 8551        # Header
 8552        vcf_reader = self.get_header()
 8553
 8554        # Add variant_id to header
 8555        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8556            variant_id_tag,
 8557            ".",
 8558            "String",
 8559            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8560            "howard calculation",
 8561            "0",
 8562            self.code_type_map.get("String"),
 8563        )
 8564
 8565        # Update
 8566        sql_update = f"""
 8567            UPDATE {table_variants}
 8568            SET "INFO" = 
 8569                concat(
 8570                    CASE
 8571                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8572                        THEN ''
 8573                        ELSE concat("INFO", ';')
 8574                    END,
 8575                    '{variant_id_tag}=',
 8576                    "{variant_id_tag}"
 8577                )
 8578        """
 8579        self.conn.execute(sql_update)
 8580
 8581        # Remove added columns
 8582        for added_column in added_columns:
 8583            self.drop_column(column=added_column)
 8584
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the SnpEff annotation field and append
        them to the INFO column of the variants table.

        The SnpEff sub-field layout is parsed from the quoted part of the
        `snpeff_field` header description, the annotation field is exploded
        into a dataframe, HGVS values are extracted per variant with
        `extract_snpeff_hgvs`, and the result is written back as a new
        '<snpeff_hgvs>=<value>' INFO entry. If `snpeff_field` is absent from
        the header, a warning is logged and nothing is changed.

        :param snpeff_hgvs: name of the INFO tag that will receive the
        extracted HGVS nomenclatures, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field holding the SnpEff
        annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description does not contain
        a quoted, pipe-separated sub-field list
        """

        # Snpeff hgvs tags: description used when declaring the new INFO tag
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm
        # this overwrite is intended rather than keeping the configured prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields: exploded column names for the source and target fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added temporarily for this calculation (dropped at the end)
        added_columns = []

        # Explode HGVS field in column so it can be queried as a table column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: SnpEff lists its sub-fields inside single
            # quotes, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters as a normalized key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe holding variant id and the exploded SnpEff field
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column: extract HGVS from each annotation
            # using the parsed sub-field layout
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO by joining on the variant id; the local dataframe is
            # referenced directly in the query (duckdb scans python variables)
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory as soon as possible
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8721
 8722    def calculation_snpeff_ann_explode(
 8723        self,
 8724        uniquify: bool = True,
 8725        output_format: str = "fields",
 8726        output_prefix: str = "snpeff_",
 8727        snpeff_field: str = "ANN",
 8728    ) -> None:
 8729        """
 8730        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8731        exploding the HGVS field and updating variant information accordingly.
 8732
 8733        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8734        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8735        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8736        defaults to True
 8737        :type uniquify: bool (optional)
 8738        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8739        function specifies the format in which the output annotations will be generated. It has a
 8740        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8741        format, defaults to fields
 8742        :type output_format: str (optional)
 8743        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8744        method is used to specify the prefix that will be added to the output annotations generated
 8745        during the calculation process. This prefix helps to differentiate the newly added annotations
 8746        from existing ones in the output data. By default, the, defaults to ANN_
 8747        :type output_prefix: str (optional)
 8748        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8749        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8750        field will be processed to explode the HGVS annotations and update the variant information
 8751        accordingly, defaults to ANN
 8752        :type snpeff_field: str (optional)
 8753        """
 8754
 8755        # SnpEff annotation field
 8756        snpeff_hgvs = "snpeff_ann_explode"
 8757
 8758        # Snpeff hgvs tags
 8759        vcf_infos_tags = {
 8760            snpeff_hgvs: "Explode snpEff annotations",
 8761        }
 8762
 8763        # Prefix
 8764        prefix = self.get_explode_infos_prefix()
 8765        if prefix:
 8766            prefix = "INFO/"
 8767
 8768        # snpEff fields
 8769        speff_ann_infos = prefix + snpeff_field
 8770        speff_hgvs_infos = prefix + snpeff_hgvs
 8771
 8772        # Variants table
 8773        table_variants = self.get_table_variants()
 8774
 8775        # Header
 8776        vcf_reader = self.get_header()
 8777
 8778        # Add columns
 8779        added_columns = []
 8780
 8781        # Explode HGVS field in column
 8782        added_columns += self.explode_infos(fields=[snpeff_field])
 8783        log.debug(f"snpeff_field={snpeff_field}")
 8784        log.debug(f"added_columns={added_columns}")
 8785
 8786        if snpeff_field in vcf_reader.infos:
 8787
 8788            # Extract ANN header
 8789            ann_description = vcf_reader.infos[snpeff_field].desc
 8790            pattern = r"'(.+?)'"
 8791            match = re.search(pattern, ann_description)
 8792            if match:
 8793                ann_header_match = match.group(1).split(" | ")
 8794                ann_header = []
 8795                ann_header_desc = {}
 8796                for i in range(len(ann_header_match)):
 8797                    ann_header_info = "".join(
 8798                        char for char in ann_header_match[i] if char.isalnum()
 8799                    )
 8800                    ann_header.append(ann_header_info)
 8801                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8802                if not ann_header_desc:
 8803                    raise ValueError("Invalid header description format")
 8804            else:
 8805                raise ValueError("Invalid header description format")
 8806
 8807            # Create variant id
 8808            variant_id_column = self.get_variant_id_column()
 8809            added_columns += [variant_id_column]
 8810
 8811            # Create dataframe
 8812            dataframe_snpeff_hgvs = self.get_query_to_df(
 8813                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8814            )
 8815
 8816            # Create snpEff columns
 8817            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8818                speff_ann_infos
 8819            ].apply(
 8820                lambda x: explode_snpeff_ann(
 8821                    str(x),
 8822                    uniquify=uniquify,
 8823                    output_format=output_format,
 8824                    prefix=output_prefix,
 8825                    header=list(ann_header_desc.values()),
 8826                )
 8827            )
 8828
 8829            # Header
 8830            ann_annotations_prefix = ""
 8831            if output_format.upper() in ["JSON"]:
 8832                ann_annotations_prefix = f"{output_prefix}="
 8833                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8834                    output_prefix,
 8835                    ".",
 8836                    "String",
 8837                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8838                    + " - JSON format",
 8839                    "howard calculation",
 8840                    "0",
 8841                    self.code_type_map.get("String"),
 8842                )
 8843            else:
 8844                for ann_annotation in ann_header:
 8845                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8846                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8847                        ann_annotation_id,
 8848                        ".",
 8849                        "String",
 8850                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8851                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8852                        "howard calculation",
 8853                        "0",
 8854                        self.code_type_map.get("String"),
 8855                    )
 8856
 8857            # Update
 8858            sql_update = f"""
 8859                UPDATE variants
 8860                SET "INFO" = 
 8861                    concat(
 8862                        CASE
 8863                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8864                            THEN ''
 8865                            ELSE concat("INFO", ';')
 8866                        END,
 8867                        CASE 
 8868                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8869                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8870                            THEN concat(
 8871                                '{ann_annotations_prefix}',
 8872                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8873                                )
 8874                            ELSE ''
 8875                        END
 8876                    )
 8877                FROM dataframe_snpeff_hgvs
 8878                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8879
 8880            """
 8881            self.conn.execute(sql_update)
 8882
 8883            # Delete dataframe
 8884            del dataframe_snpeff_hgvs
 8885            gc.collect()
 8886
 8887        else:
 8888
 8889            log.warning(
 8890                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8891            )
 8892
 8893        # Remove added columns
 8894        for added_column in added_columns:
 8895            self.drop_column(column=added_column)
 8896
 8897    def calculation_extract_nomen(self) -> None:
 8898        """
 8899        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8900        """
 8901
 8902        # NOMEN field
 8903        field_nomen_dict = "NOMEN_DICT"
 8904
 8905        # NOMEN structure
 8906        nomen_dict = {
 8907            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8908            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8909            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8910            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8911            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8912            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8913            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8914            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8915            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8916            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8917        }
 8918
 8919        # Param
 8920        param = self.get_param()
 8921
 8922        # Threads
 8923        threads = self.get_threads()
 8924
 8925        # Prefix
 8926        prefix = self.get_explode_infos_prefix()
 8927
 8928        # Header
 8929        vcf_reader = self.get_header()
 8930
 8931        # Added columns
 8932        added_columns = []
 8933
 8934        # Get HGVS field
 8935        hgvs_field = (
 8936            param.get("calculation", {})
 8937            .get("calculations", {})
 8938            .get("NOMEN", {})
 8939            .get("options", {})
 8940            .get("hgvs_field", "hgvs")
 8941        )
 8942
 8943        # Get NOMEN pattern
 8944        nomen_pattern = (
 8945            param.get("calculation", {})
 8946            .get("calculations", {})
 8947            .get("NOMEN", {})
 8948            .get("options", {})
 8949            .get("pattern", None)
 8950        )
 8951
 8952        # transcripts list of preference sources
 8953        transcripts_sources = {}
 8954
 8955        # Get transcripts
 8956        transcripts_file = (
 8957            param.get("calculation", {})
 8958            .get("calculations", {})
 8959            .get("NOMEN", {})
 8960            .get("options", {})
 8961            .get("transcripts", None)
 8962        )
 8963        transcripts_file = full_path(transcripts_file)
 8964        if transcripts_file:
 8965            if os.path.exists(transcripts_file):
 8966                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8967                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 8968                transcripts_sources["file"] = transcripts_from_file
 8969            else:
 8970                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 8971                log.error(msg_err)
 8972                raise ValueError(msg_err)
 8973
 8974        # Get transcripts table
 8975        transcripts_table = (
 8976            param.get("calculation", {})
 8977            .get("calculations", {})
 8978            .get("NOMEN", {})
 8979            .get("options", {})
 8980            .get("transcripts_table", self.get_table_variants())
 8981        )
 8982        # Get transcripts column
 8983        transcripts_column = (
 8984            param.get("calculation", {})
 8985            .get("calculations", {})
 8986            .get("NOMEN", {})
 8987            .get("options", {})
 8988            .get("transcripts_column", None)
 8989        )
 8990
 8991        if transcripts_table and transcripts_column:
 8992            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 8993            # Explode if not exists
 8994            added_columns += self.explode_infos(
 8995                fields=[transcripts_column], table=transcripts_table
 8996            )
 8997        else:
 8998            extra_field_transcript = f"NULL"
 8999
 9000        # Transcripts of preference source order
 9001        transcripts_order = (
 9002            param.get("calculation", {})
 9003            .get("calculations", {})
 9004            .get("NOMEN", {})
 9005            .get("options", {})
 9006            .get("transcripts_order", ["column", "file"])
 9007        )
 9008
 9009        # Transcripts from file
 9010        transcripts = transcripts_sources.get("file", [])
 9011
 9012        # Explode HGVS field in column
 9013        added_columns += self.explode_infos(fields=[hgvs_field])
 9014
 9015        # extra infos
 9016        extra_infos = self.get_extra_infos()
 9017        extra_field = prefix + hgvs_field
 9018
 9019        if extra_field in extra_infos:
 9020
 9021            # Create dataframe
 9022            dataframe_hgvs = self.get_query_to_df(
 9023                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 9024            )
 9025
 9026            # Transcripts rank
 9027            transcripts_rank = {
 9028                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
 9029            }
 9030            transcripts_len = len(transcripts_rank)
 9031
 9032            # Create main NOMEN column
 9033            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 9034                lambda x: find_nomen(
 9035                    hgvs=x.hgvs,
 9036                    transcript=x.transcript,
 9037                    transcripts=transcripts_rank,
 9038                    pattern=nomen_pattern,
 9039                    transcripts_source_order=transcripts_order,
 9040                    transcripts_len=transcripts_len,
 9041                ),
 9042                axis=1,
 9043            )
 9044
 9045            # Explode NOMEN Structure and create SQL set for update
 9046            sql_nomen_fields = []
 9047            for nomen_field in nomen_dict:
 9048
 9049                # Create VCF header field
 9050                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 9051                    nomen_field,
 9052                    ".",
 9053                    "String",
 9054                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 9055                    "howard calculation",
 9056                    "0",
 9057                    self.code_type_map.get("String"),
 9058                )
 9059
 9060                # Add field to SQL query update
 9061                sql_nomen_fields.append(
 9062                    f"""
 9063                        CASE 
 9064                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
 9065                            THEN concat(
 9066                                    ';{nomen_field}=',
 9067                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
 9068                                )
 9069                            ELSE ''
 9070                        END
 9071                    """
 9072                )
 9073
 9074            # SQL set for update
 9075            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 9076
 9077            # Update
 9078            sql_update = f"""
 9079                UPDATE variants
 9080                SET "INFO" = 
 9081                    concat(
 9082                        CASE
 9083                            WHEN "INFO" IS NULL
 9084                            THEN ''
 9085                            ELSE "INFO"
 9086                        END,
 9087                        {sql_nomen_fields_set}
 9088                    )
 9089                FROM dataframe_hgvs
 9090                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 9091                    AND variants."POS" = dataframe_hgvs."POS" 
 9092                    AND variants."REF" = dataframe_hgvs."REF"
 9093                    AND variants."ALT" = dataframe_hgvs."ALT"
 9094            """
 9095            self.conn.execute(sql_update)
 9096
 9097            # Delete dataframe
 9098            del dataframe_hgvs
 9099            gc.collect()
 9100
 9101        # Remove added columns
 9102        for added_column in added_columns:
 9103            self.drop_column(column=added_column)
 9104
 9105    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9106        """
 9107        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9108        pipeline/sample for a variant and updates the variant information in a VCF file.
 9109
 9110        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9111        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9112        VCF header and to update the corresponding field in the variants table, defaults to
 9113        findbypipeline
 9114        :type tag: str (optional)
 9115        """
 9116
 9117        # if FORMAT and samples
 9118        if (
 9119            "FORMAT" in self.get_header_columns_as_list()
 9120            and self.get_header_sample_list()
 9121        ):
 9122
 9123            # findbypipeline annotation field
 9124            findbypipeline_tag = tag
 9125
 9126            # VCF infos tags
 9127            vcf_infos_tags = {
 9128                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9129            }
 9130
 9131            # Prefix
 9132            prefix = self.get_explode_infos_prefix()
 9133
 9134            # Field
 9135            findbypipeline_infos = prefix + findbypipeline_tag
 9136
 9137            # Variants table
 9138            table_variants = self.get_table_variants()
 9139
 9140            # Header
 9141            vcf_reader = self.get_header()
 9142
 9143            # Create variant id
 9144            variant_id_column = self.get_variant_id_column()
 9145            added_columns = [variant_id_column]
 9146
 9147            # variant_id, FORMAT and samples
 9148            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9149                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9150            )
 9151
 9152            # Create dataframe
 9153            dataframe_findbypipeline = self.get_query_to_df(
 9154                f""" SELECT {samples_fields} FROM {table_variants} """
 9155            )
 9156
 9157            # Create findbypipeline column
 9158            dataframe_findbypipeline[findbypipeline_infos] = (
 9159                dataframe_findbypipeline.apply(
 9160                    lambda row: findbypipeline(
 9161                        row, samples=self.get_header_sample_list()
 9162                    ),
 9163                    axis=1,
 9164                )
 9165            )
 9166
 9167            # Add snpeff_hgvs to header
 9168            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9169                findbypipeline_tag,
 9170                ".",
 9171                "String",
 9172                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9173                "howard calculation",
 9174                "0",
 9175                self.code_type_map.get("String"),
 9176            )
 9177
 9178            # Update
 9179            sql_update = f"""
 9180                UPDATE variants
 9181                SET "INFO" = 
 9182                    concat(
 9183                        CASE
 9184                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9185                            THEN ''
 9186                            ELSE concat("INFO", ';')
 9187                        END,
 9188                        CASE 
 9189                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9190                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9191                            THEN concat(
 9192                                    '{findbypipeline_tag}=',
 9193                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9194                                )
 9195                            ELSE ''
 9196                        END
 9197                    )
 9198                FROM dataframe_findbypipeline
 9199                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9200            """
 9201            self.conn.execute(sql_update)
 9202
 9203            # Remove added columns
 9204            for added_column in added_columns:
 9205                self.drop_column(column=added_column)
 9206
 9207            # Delete dataframe
 9208            del dataframe_findbypipeline
 9209            gc.collect()
 9210
 9211    def calculation_genotype_concordance(self) -> None:
 9212        """
 9213        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9214        multi-caller VCF files and updates the variant information in the database.
 9215        """
 9216
 9217        # if FORMAT and samples
 9218        if (
 9219            "FORMAT" in self.get_header_columns_as_list()
 9220            and self.get_header_sample_list()
 9221        ):
 9222
 9223            # genotypeconcordance annotation field
 9224            genotypeconcordance_tag = "genotypeconcordance"
 9225
 9226            # VCF infos tags
 9227            vcf_infos_tags = {
 9228                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9229            }
 9230
 9231            # Prefix
 9232            prefix = self.get_explode_infos_prefix()
 9233
 9234            # Field
 9235            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9236
 9237            # Variants table
 9238            table_variants = self.get_table_variants()
 9239
 9240            # Header
 9241            vcf_reader = self.get_header()
 9242
 9243            # Create variant id
 9244            variant_id_column = self.get_variant_id_column()
 9245            added_columns = [variant_id_column]
 9246
 9247            # variant_id, FORMAT and samples
 9248            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9249                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9250            )
 9251
 9252            # Create dataframe
 9253            dataframe_genotypeconcordance = self.get_query_to_df(
 9254                f""" SELECT {samples_fields} FROM {table_variants} """
 9255            )
 9256
 9257            # Create genotypeconcordance column
 9258            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9259                dataframe_genotypeconcordance.apply(
 9260                    lambda row: genotypeconcordance(
 9261                        row, samples=self.get_header_sample_list()
 9262                    ),
 9263                    axis=1,
 9264                )
 9265            )
 9266
 9267            # Add genotypeconcordance to header
 9268            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9269                genotypeconcordance_tag,
 9270                ".",
 9271                "String",
 9272                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9273                "howard calculation",
 9274                "0",
 9275                self.code_type_map.get("String"),
 9276            )
 9277
 9278            # Update
 9279            sql_update = f"""
 9280                UPDATE variants
 9281                SET "INFO" = 
 9282                    concat(
 9283                        CASE
 9284                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9285                            THEN ''
 9286                            ELSE concat("INFO", ';')
 9287                        END,
 9288                        CASE
 9289                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9290                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9291                            THEN concat(
 9292                                    '{genotypeconcordance_tag}=',
 9293                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9294                                )
 9295                            ELSE ''
 9296                        END
 9297                    )
 9298                FROM dataframe_genotypeconcordance
 9299                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9300            """
 9301            self.conn.execute(sql_update)
 9302
 9303            # Remove added columns
 9304            for added_column in added_columns:
 9305                self.drop_column(column=added_column)
 9306
 9307            # Delete dataframe
 9308            del dataframe_genotypeconcordance
 9309            gc.collect()
 9310
 9311    def calculation_barcode(self, tag: str = "barcode") -> None:
 9312        """
 9313        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9314        updates the INFO field in the file with the calculated barcode values.
 9315
 9316        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9317        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9318        the default tag name is set to "barcode", defaults to barcode
 9319        :type tag: str (optional)
 9320        """
 9321
 9322        # if FORMAT and samples
 9323        if (
 9324            "FORMAT" in self.get_header_columns_as_list()
 9325            and self.get_header_sample_list()
 9326        ):
 9327
 9328            # barcode annotation field
 9329            if not tag:
 9330                tag = "barcode"
 9331
 9332            # VCF infos tags
 9333            vcf_infos_tags = {
 9334                tag: "barcode calculation (VaRank)",
 9335            }
 9336
 9337            # Prefix
 9338            prefix = self.get_explode_infos_prefix()
 9339
 9340            # Field
 9341            barcode_infos = prefix + tag
 9342
 9343            # Variants table
 9344            table_variants = self.get_table_variants()
 9345
 9346            # Header
 9347            vcf_reader = self.get_header()
 9348
 9349            # Create variant id
 9350            variant_id_column = self.get_variant_id_column()
 9351            added_columns = [variant_id_column]
 9352
 9353            # variant_id, FORMAT and samples
 9354            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9355                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9356            )
 9357
 9358            # Create dataframe
 9359            dataframe_barcode = self.get_query_to_df(
 9360                f""" SELECT {samples_fields} FROM {table_variants} """
 9361            )
 9362
 9363            # Create barcode column
 9364            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9365                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9366            )
 9367
 9368            # Add barcode to header
 9369            vcf_reader.infos[tag] = vcf.parser._Info(
 9370                tag,
 9371                ".",
 9372                "String",
 9373                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9374                "howard calculation",
 9375                "0",
 9376                self.code_type_map.get("String"),
 9377            )
 9378
 9379            # Update
 9380            sql_update = f"""
 9381                UPDATE {table_variants}
 9382                SET "INFO" = 
 9383                    concat(
 9384                        CASE
 9385                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9386                            THEN ''
 9387                            ELSE concat("INFO", ';')
 9388                        END,
 9389                        CASE
 9390                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9391                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9392                            THEN concat(
 9393                                    '{tag}=',
 9394                                    dataframe_barcode."{barcode_infos}"
 9395                                )
 9396                            ELSE ''
 9397                        END
 9398                    )
 9399                FROM dataframe_barcode
 9400                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9401            """
 9402            self.conn.execute(sql_update)
 9403
 9404            # Remove added columns
 9405            for added_column in added_columns:
 9406                self.drop_column(column=added_column)
 9407
 9408            # Delete dataframe
 9409            del dataframe_barcode
 9410            gc.collect()
 9411
 9412    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9413        """
 9414        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9415        and updates the INFO field in the file with the calculated barcode values.
 9416
 9417        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9418        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9419        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9420        :type tag: str (optional)
 9421        """
 9422
 9423        # if FORMAT and samples
 9424        if (
 9425            "FORMAT" in self.get_header_columns_as_list()
 9426            and self.get_header_sample_list()
 9427        ):
 9428
 9429            # barcode annotation field
 9430            if not tag:
 9431                tag = "BCF"
 9432
 9433            # VCF infos tags
 9434            vcf_infos_tags = {
 9435                tag: "barcode family calculation",
 9436                f"{tag}S": "barcode family samples",
 9437            }
 9438
 9439            # Param
 9440            param = self.get_param()
 9441            log.debug(f"param={param}")
 9442
 9443            # Prefix
 9444            prefix = self.get_explode_infos_prefix()
 9445
 9446            # PED param
 9447            ped = (
 9448                param.get("calculation", {})
 9449                .get("calculations", {})
 9450                .get("BARCODEFAMILY", {})
 9451                .get("family_pedigree", None)
 9452            )
 9453            log.debug(f"ped={ped}")
 9454
 9455            # Load PED
 9456            if ped:
 9457
 9458                # Pedigree is a file
 9459                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9460                    log.debug("Pedigree is file")
 9461                    with open(full_path(ped)) as ped:
 9462                        ped = yaml.safe_load(ped)
 9463
 9464                # Pedigree is a string
 9465                elif isinstance(ped, str):
 9466                    log.debug("Pedigree is str")
 9467                    try:
 9468                        ped = json.loads(ped)
 9469                        log.debug("Pedigree is json str")
 9470                    except ValueError as e:
 9471                        ped_samples = ped.split(",")
 9472                        ped = {}
 9473                        for ped_sample in ped_samples:
 9474                            ped[ped_sample] = ped_sample
 9475
 9476                # Pedigree is a dict
 9477                elif isinstance(ped, dict):
 9478                    log.debug("Pedigree is dict")
 9479
 9480                # Pedigree is not well formatted
 9481                else:
 9482                    msg_error = "Pedigree not well formatted"
 9483                    log.error(msg_error)
 9484                    raise ValueError(msg_error)
 9485
 9486                # Construct list
 9487                ped_samples = list(ped.values())
 9488
 9489            else:
 9490                log.debug("Pedigree not defined. Take all samples")
 9491                ped_samples = self.get_header_sample_list()
 9492                ped = {}
 9493                for ped_sample in ped_samples:
 9494                    ped[ped_sample] = ped_sample
 9495
 9496            # Check pedigree
 9497            if not ped or len(ped) == 0:
 9498                msg_error = f"Error in pedigree: samples {ped_samples}"
 9499                log.error(msg_error)
 9500                raise ValueError(msg_error)
 9501
 9502            # Log
 9503            log.info(
 9504                "Calculation 'BARCODEFAMILY' - Samples: "
 9505                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9506            )
 9507            log.debug(f"ped_samples={ped_samples}")
 9508
 9509            # Field
 9510            barcode_infos = prefix + tag
 9511
 9512            # Variants table
 9513            table_variants = self.get_table_variants()
 9514
 9515            # Header
 9516            vcf_reader = self.get_header()
 9517
 9518            # Create variant id
 9519            variant_id_column = self.get_variant_id_column()
 9520            added_columns = [variant_id_column]
 9521
 9522            # variant_id, FORMAT and samples
 9523            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9524                [f""" "{sample}" """ for sample in ped_samples]
 9525            )
 9526
 9527            # Create dataframe
 9528            dataframe_barcode = self.get_query_to_df(
 9529                f""" SELECT {samples_fields} FROM {table_variants} """
 9530            )
 9531
 9532            # Create barcode column
 9533            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9534                lambda row: barcode(row, samples=ped_samples), axis=1
 9535            )
 9536
 9537            # Add barcode family to header
 9538            # Add vaf_normalization to header
 9539            vcf_reader.formats[tag] = vcf.parser._Format(
 9540                id=tag,
 9541                num=".",
 9542                type="String",
 9543                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9544                type_code=self.code_type_map.get("String"),
 9545            )
 9546            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9547                id=f"{tag}S",
 9548                num=".",
 9549                type="String",
 9550                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9551                type_code=self.code_type_map.get("String"),
 9552            )
 9553
 9554            # Update
 9555            # for sample in ped_samples:
 9556            sql_update_set = []
 9557            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9558                if sample in ped_samples:
 9559                    value = f'dataframe_barcode."{barcode_infos}"'
 9560                    value_samples = (
 9561                        "'"
 9562                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
 9563                        + "'"
 9564                    )
 9565                    ped_samples
 9566                elif sample == "FORMAT":
 9567                    value = f"'{tag}'"
 9568                    value_samples = f"'{tag}S'"
 9569                else:
 9570                    value = "'.'"
 9571                    value_samples = "'.'"
 9572                format_regex = r"[a-zA-Z0-9\s]"
 9573                sql_update_set.append(
 9574                    f"""
 9575                        "{sample}" = 
 9576                        concat(
 9577                            CASE
 9578                                WHEN {table_variants}."{sample}" = './.'
 9579                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9580                                ELSE {table_variants}."{sample}"
 9581                            END,
 9582                            ':',
 9583                            {value},
 9584                            ':',
 9585                            {value_samples}
 9586                        )
 9587                    """
 9588                )
 9589
 9590            sql_update_set_join = ", ".join(sql_update_set)
 9591            sql_update = f"""
 9592                UPDATE {table_variants}
 9593                SET {sql_update_set_join}
 9594                FROM dataframe_barcode
 9595                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9596            """
 9597            self.conn.execute(sql_update)
 9598
 9599            # Remove added columns
 9600            for added_column in added_columns:
 9601                self.drop_column(column=added_column)
 9602
 9603            # Delete dataframe
 9604            del dataframe_barcode
 9605            gc.collect()
 9606
 9607    def calculation_trio(self) -> None:
 9608        """
 9609        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9610        information to the INFO field of each variant.
 9611        """
 9612
 9613        # if FORMAT and samples
 9614        if (
 9615            "FORMAT" in self.get_header_columns_as_list()
 9616            and self.get_header_sample_list()
 9617        ):
 9618
 9619            # trio annotation field
 9620            trio_tag = "trio"
 9621
 9622            # VCF infos tags
 9623            vcf_infos_tags = {
 9624                "trio": "trio calculation",
 9625            }
 9626
 9627            # Param
 9628            param = self.get_param()
 9629
 9630            # Prefix
 9631            prefix = self.get_explode_infos_prefix()
 9632
 9633            # Trio param
 9634            trio_ped = (
 9635                param.get("calculation", {})
 9636                .get("calculations", {})
 9637                .get("TRIO", {})
 9638                .get("trio_pedigree", None)
 9639            )
 9640
 9641            # Load trio
 9642            if trio_ped:
 9643
 9644                # Trio pedigree is a file
 9645                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9646                    log.debug("TRIO pedigree is file")
 9647                    with open(full_path(trio_ped)) as trio_ped:
 9648                        trio_ped = yaml.safe_load(trio_ped)
 9649
 9650                # Trio pedigree is a string
 9651                elif isinstance(trio_ped, str):
 9652                    log.debug("TRIO pedigree is str")
 9653                    try:
 9654                        trio_ped = json.loads(trio_ped)
 9655                        log.debug("TRIO pedigree is json str")
 9656                    except ValueError as e:
 9657                        trio_samples = trio_ped.split(",")
 9658                        if len(trio_samples) == 3:
 9659                            trio_ped = {
 9660                                "father": trio_samples[0],
 9661                                "mother": trio_samples[1],
 9662                                "child": trio_samples[2],
 9663                            }
 9664                            log.debug("TRIO pedigree is list str")
 9665                        else:
 9666                            msg_error = "TRIO pedigree not well formatted"
 9667                            log.error(msg_error)
 9668                            raise ValueError(msg_error)
 9669
 9670                # Trio pedigree is a dict
 9671                elif isinstance(trio_ped, dict):
 9672                    log.debug("TRIO pedigree is dict")
 9673
 9674                # Trio pedigree is not well formatted
 9675                else:
 9676                    msg_error = "TRIO pedigree not well formatted"
 9677                    log.error(msg_error)
 9678                    raise ValueError(msg_error)
 9679
 9680                # Construct trio list
 9681                trio_samples = [
 9682                    trio_ped.get("father", ""),
 9683                    trio_ped.get("mother", ""),
 9684                    trio_ped.get("child", ""),
 9685                ]
 9686
 9687            else:
 9688                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9689                samples_list = self.get_header_sample_list()
 9690                if len(samples_list) >= 3:
 9691                    trio_samples = self.get_header_sample_list()[0:3]
 9692                    trio_ped = {
 9693                        "father": trio_samples[0],
 9694                        "mother": trio_samples[1],
 9695                        "child": trio_samples[2],
 9696                    }
 9697                else:
 9698                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9699                    log.error(msg_error)
 9700                    raise ValueError(msg_error)
 9701
 9702            # Check trio pedigree
 9703            if not trio_ped or len(trio_ped) != 3:
 9704                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9705                log.error(msg_error)
 9706                raise ValueError(msg_error)
 9707
 9708            # Log
 9709            log.info(
 9710                f"Calculation 'TRIO' - Samples: "
 9711                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9712            )
 9713
 9714            # Field
 9715            trio_infos = prefix + trio_tag
 9716
 9717            # Variants table
 9718            table_variants = self.get_table_variants()
 9719
 9720            # Header
 9721            vcf_reader = self.get_header()
 9722
 9723            # Create variant id
 9724            variant_id_column = self.get_variant_id_column()
 9725            added_columns = [variant_id_column]
 9726
 9727            # variant_id, FORMAT and samples
 9728            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9729                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9730            )
 9731
 9732            # Create dataframe
 9733            dataframe_trio = self.get_query_to_df(
 9734                f""" SELECT {samples_fields} FROM {table_variants} """
 9735            )
 9736
 9737            # Create trio column
 9738            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9739                lambda row: trio(row, samples=trio_samples), axis=1
 9740            )
 9741
 9742            # Add trio to header
 9743            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9744                trio_tag,
 9745                ".",
 9746                "String",
 9747                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9748                "howard calculation",
 9749                "0",
 9750                self.code_type_map.get("String"),
 9751            )
 9752
 9753            # Update
 9754            sql_update = f"""
 9755                UPDATE {table_variants}
 9756                SET "INFO" = 
 9757                    concat(
 9758                        CASE
 9759                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9760                            THEN ''
 9761                            ELSE concat("INFO", ';')
 9762                        END,
 9763                        CASE
 9764                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9765                             AND dataframe_trio."{trio_infos}" NOT NULL
 9766                            THEN concat(
 9767                                    '{trio_tag}=',
 9768                                    dataframe_trio."{trio_infos}"
 9769                                )
 9770                            ELSE ''
 9771                        END
 9772                    )
 9773                FROM dataframe_trio
 9774                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9775            """
 9776            self.conn.execute(sql_update)
 9777
 9778            # Remove added columns
 9779            for added_column in added_columns:
 9780                self.drop_column(column=added_column)
 9781
 9782            # Delete dataframe
 9783            del dataframe_trio
 9784            gc.collect()
 9785
 9786    def calculation_vaf_normalization(self) -> None:
 9787        """
 9788        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9789        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9790        :return: The function does not return anything.
 9791        """
 9792
 9793        # if FORMAT and samples
 9794        if (
 9795            "FORMAT" in self.get_header_columns_as_list()
 9796            and self.get_header_sample_list()
 9797        ):
 9798
 9799            # vaf_normalization annotation field
 9800            vaf_normalization_tag = "VAF"
 9801
 9802            # VCF infos tags
 9803            vcf_infos_tags = {
 9804                "VAF": "VAF Variant Frequency",
 9805            }
 9806
 9807            # Prefix
 9808            prefix = self.get_explode_infos_prefix()
 9809
 9810            # Variants table
 9811            table_variants = self.get_table_variants()
 9812
 9813            # Header
 9814            vcf_reader = self.get_header()
 9815
 9816            # Do not calculate if VAF already exists
 9817            if "VAF" in vcf_reader.formats:
 9818                log.debug("VAF already on genotypes")
 9819                return
 9820
 9821            # Create variant id
 9822            variant_id_column = self.get_variant_id_column()
 9823            added_columns = [variant_id_column]
 9824
 9825            # variant_id, FORMAT and samples
 9826            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9827                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9828            )
 9829
 9830            # Create dataframe
 9831            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9832            log.debug(f"query={query}")
 9833            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9834
 9835            vaf_normalization_set = []
 9836
 9837            # for each sample vaf_normalization
 9838            for sample in self.get_header_sample_list():
 9839                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9840                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9841                )
 9842                vaf_normalization_set.append(
 9843                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9844                )
 9845
 9846            # Add VAF to FORMAT
 9847            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9848                "FORMAT"
 9849            ].apply(lambda x: str(x) + ":VAF")
 9850            vaf_normalization_set.append(
 9851                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9852            )
 9853
 9854            # Add vaf_normalization to header
 9855            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9856                id=vaf_normalization_tag,
 9857                num="1",
 9858                type="Float",
 9859                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9860                type_code=self.code_type_map.get("Float"),
 9861            )
 9862
 9863            # Create fields to add in INFO
 9864            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9865
 9866            # Update
 9867            sql_update = f"""
 9868                UPDATE {table_variants}
 9869                SET {sql_vaf_normalization_set}
 9870                FROM dataframe_vaf_normalization
 9871                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9872
 9873            """
 9874            self.conn.execute(sql_update)
 9875
 9876            # Remove added columns
 9877            for added_column in added_columns:
 9878                self.drop_column(column=added_column)
 9879
 9880            # Delete dataframe
 9881            del dataframe_vaf_normalization
 9882            gc.collect()
 9883
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Statistics added as INFO tags: <info>_stats_nb, _list, _min, _max, _mean,
        _mediane, _stdev.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only relevant if the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per statistic; also drives the stats extraction loop below
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column holding the per-variant stats structure)
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (used to join the pandas dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column; `genotype_stats` presumably returns a dict-like of
            # per-stat values — TODO confirm against its definition
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags (SQL fragments, one per statistic)
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats into its own dataframe column
                # (lambda captures `stat` late, but apply() runs within this iteration)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no separator; following ones are prefixed with ';'
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update: append all 'stat=value' pairs to INFO, joined to the dataframe by variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
10021
10022    def calculation_transcripts_annotation(
10023        self, info_json: str = None, info_format: str = None
10024    ) -> None:
10025        """
10026        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10027        field to it if transcripts are available.
10028
10029        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10030        is a string parameter that represents the information field to be used in the transcripts JSON.
10031        It is used to specify the JSON format for the transcripts information. If no value is provided
10032        when calling the method, it defaults to "
10033        :type info_json: str
10034        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10035        method is a string parameter that specifies the format of the information field to be used in
10036        the transcripts JSON. It is used to define the format of the information field
10037        :type info_format: str
10038        """
10039
10040        # Create transcripts table
10041        transcripts_table = self.create_transcript_view()
10042
10043        # Add info field
10044        if transcripts_table:
10045            self.transcript_view_to_variants(
10046                transcripts_table=transcripts_table,
10047                transcripts_info_field_json=info_json,
10048                transcripts_info_field_format=info_format,
10049            )
10050        else:
10051            log.info("No Transcripts to process. Check param.json file configuration")
10052
10053    def calculation_transcripts_prioritization(self) -> None:
10054        """
10055        The function `calculation_transcripts_prioritization` creates a transcripts table and
10056        prioritizes transcripts based on certain criteria.
10057        """
10058
10059        # Create transcripts table
10060        transcripts_table = self.create_transcript_view()
10061
10062        # Add info field
10063        if transcripts_table:
10064            self.transcripts_prioritization(transcripts_table=transcripts_table)
10065        else:
10066            log.info("No Transcripts to process. Check param.json file configuration")
10067
10068    def calculation_transcripts_export(self) -> None:
10069        """ """
10070
10071        # Create transcripts table
10072        transcripts_table = self.create_transcript_view()
10073
10074        # Add info field
10075        if transcripts_table:
10076            self.transcripts_export(transcripts_table=transcripts_table)
10077        else:
10078            log.info("No Transcripts to process. Check param.json file configuration")
10079
10080    ###############
10081    # Transcripts #
10082    ###############
10083
10084    def transcripts_export(
10085        self, transcripts_table: str = None, param: dict = {}
10086    ) -> bool:
10087        """ """
10088
10089        log.debug("Start transcripts export...")
10090
10091        # Param
10092        if not param:
10093            param = self.get_param()
10094
10095        # Param export
10096        param_transcript_export = param.get("transcripts", {}).get("export", {})
10097
10098        # Output file
10099        transcripts_export_output = param_transcript_export.get("output", None)
10100
10101        if not param_transcript_export or not transcripts_export_output:
10102            log.warning(f"No transcriipts export parameters defined!")
10103            return False
10104
10105        # List of transcripts annotations
10106        query_describe = f"""
10107            SELECT column_name
10108            FROM (
10109                    DESCRIBE SELECT * FROM {transcripts_table}
10110                )
10111            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10112        """
10113        transcripts_annotations_list = list(
10114            self.get_query_to_df(query=query_describe)["column_name"]
10115        )
10116
10117        # Create transcripts table for export
10118        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10119            random.choices(string.ascii_uppercase + string.digits, k=10)
10120        )
10121        query_create_transcripts_table_export = f"""
10122            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10123        """
10124        self.execute_query(query=query_create_transcripts_table_export)
10125
10126        # Output file format
10127        transcripts_export_output_format = get_file_format(
10128            filename=transcripts_export_output
10129        )
10130
10131        # Format VCF - construct INFO
10132        if transcripts_export_output_format in ["vcf"]:
10133
10134            # Construct query update INFO and header
10135            query_update_info = []
10136            for field in transcripts_annotations_list:
10137
10138                # If field not in header
10139                if field not in self.get_header_infos_list():
10140
10141                    # Add PZ Transcript in header
10142                    self.get_header().infos[field] = vcf.parser._Info(
10143                        field,
10144                        ".",
10145                        "String",
10146                        f"Annotation '{field}' from transcript view",
10147                        "unknown",
10148                        "unknown",
10149                        0,
10150                    )
10151
10152                # Add field as INFO/tag
10153                query_update_info.append(
10154                    f"""
10155                        CASE
10156                            WHEN "{field}" IS NOT NULL
10157                            THEN concat('{field}=', "{field}", ';')    
10158                            ELSE ''     
10159                        END
10160                        """
10161                )
10162
10163            # Query param
10164            query_update_info_value = (
10165                f""" concat('',  {", ".join(query_update_info)}) """
10166            )
10167            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10168
10169        else:
10170
10171            # Query param
10172            query_update_info_value = f""" NULL """
10173            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10174
10175        # Update query INFO column
10176        query_update = f"""
10177            UPDATE {transcripts_table_export}
10178            SET INFO = {query_update_info_value}
10179
10180        """
10181        self.execute_query(query=query_update)
10182
10183        # Export
10184        self.export_output(
10185            output_file=transcripts_export_output,
10186            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10187        )
10188
10189        # Drop transcripts export table
10190        query_drop_transcripts_table_export = f"""
10191            DROP TABLE {transcripts_table_export}
10192        """
10193        self.execute_query(query=query_drop_transcripts_table_export)
10194
10195    def transcripts_prioritization(
10196        self, transcripts_table: str = None, param: dict = {}
10197    ) -> bool:
10198        """
10199        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10200        and updates the variants table with the prioritized information.
10201
10202        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10203        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10204        This parameter is used to identify the table where the transcripts data is stored for the
10205        prioritization process
10206        :type transcripts_table: str
10207        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10208        that contains various configuration settings for the prioritization process of transcripts. It
10209        is used to customize the behavior of the prioritization algorithm and includes settings such as
10210        the prefix for prioritization fields, default profiles, and other
10211        :type param: dict
10212        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10213        transcripts prioritization process is successfully completed, and `False` if there are any
10214        issues or if no profile is defined for transcripts prioritization.
10215        """
10216
10217        log.debug("Start transcripts prioritization...")
10218
10219        # Param
10220        if not param:
10221            param = self.get_param()
10222
10223        # Variants table
10224        table_variants = self.get_table_variants()
10225
10226        # Transcripts table
10227        if transcripts_table is None:
10228            transcripts_table = self.create_transcript_view(
10229                transcripts_table="transcripts", param=param
10230            )
10231        if transcripts_table is None:
10232            msg_err = "No Transcripts table availalble"
10233            log.error(msg_err)
10234            raise ValueError(msg_err)
10235        log.debug(f"transcripts_table={transcripts_table}")
10236
10237        # Get transcripts columns
10238        columns_as_list_query = f"""
10239            DESCRIBE {transcripts_table}
10240        """
10241        columns_as_list = list(
10242            self.get_query_to_df(columns_as_list_query)["column_name"]
10243        )
10244
10245        # Create INFO if not exists
10246        if "INFO" not in columns_as_list:
10247            query_add_info = f"""
10248                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10249            """
10250            self.execute_query(query_add_info)
10251
10252        # Prioritization param and Force only PZ Score and Flag
10253        pz_param = param.get("transcripts", {}).get("prioritization", {})
10254
10255        # PZ profile by default
10256        pz_profile_default = (
10257            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10258        )
10259
10260        # Exit if no profile
10261        if pz_profile_default is None:
10262            log.warning("No profile defined for transcripts prioritization")
10263            return False
10264
10265        # PZ fields
10266        pz_param_pzfields = {}
10267
10268        # PZ field transcripts
10269        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10270
10271        # Add PZ Transcript in header
10272        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10273            pz_fields_transcripts,
10274            ".",
10275            "String",
10276            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10277            "unknown",
10278            "unknown",
10279            code_type_map["String"],
10280        )
10281
10282        # Mandatory fields
10283        pz_mandatory_fields_list = [
10284            "Score",
10285            "Flag",
10286            "Tags",
10287            "Comment",
10288            "Infos",
10289            "Class",
10290        ]
10291        pz_mandatory_fields = []
10292        for pz_mandatory_field in pz_mandatory_fields_list:
10293            pz_mandatory_fields.append(
10294                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10295            )
10296
10297        # PZ fields in param
10298        for pz_field in pz_param.get("pzfields", []):
10299            if pz_field in pz_mandatory_fields_list:
10300                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10301                    pz_param.get("pzprefix", "PTZ") + pz_field
10302                )
10303            else:
10304                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10305                pz_param_pzfields[pz_field] = pz_field_new
10306
10307                # Add PZ Transcript in header
10308                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10309                    pz_field_new,
10310                    ".",
10311                    "String",
10312                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10313                    "unknown",
10314                    "unknown",
10315                    code_type_map["String"],
10316                )
10317
10318        # PZ fields param
10319        pz_param["pzfields"] = pz_mandatory_fields
10320
10321        # Prioritization
10322        prioritization_result = self.prioritization(
10323            table=transcripts_table,
10324            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10325        )
10326        if not prioritization_result:
10327            log.warning("Transcripts prioritization not processed")
10328            return False
10329
10330        # PZ fields sql query
10331        query_update_select_list = []
10332        query_update_concat_list = []
10333        query_update_order_list = []
10334        for pz_param_pzfield in set(
10335            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10336        ):
10337            query_update_select_list.append(f" {pz_param_pzfield}, ")
10338
10339        for pz_param_pzfield in pz_param_pzfields:
10340            query_update_concat_list.append(
10341                f"""
10342                    , CASE 
10343                        WHEN {pz_param_pzfield} IS NOT NULL
10344                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10345                        ELSE ''
10346                    END
10347                """
10348            )
10349
10350        # Order by
10351        pz_orders = (
10352            param.get("transcripts", {})
10353            .get("prioritization", {})
10354            .get("prioritization_transcripts_order", {})
10355        )
10356        if not pz_orders:
10357            pz_orders = {
10358                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10359                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10360            }
10361        for pz_order in pz_orders:
10362            query_update_order_list.append(
10363                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10364            )
10365
10366        # Fields to explode
10367        fields_to_explode = (
10368            list(pz_param_pzfields.keys())
10369            + pz_mandatory_fields
10370            + list(pz_orders.keys())
10371        )
10372        # Remove transcript column as a specific transcript column
10373        if "transcript" in fields_to_explode:
10374            fields_to_explode.remove("transcript")
10375
10376        # Fields intranscripts table
10377        query_transcripts_table = f"""
10378            DESCRIBE SELECT * FROM {transcripts_table}
10379        """
10380        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10381
10382        # Check fields to explode
10383        for field_to_explode in fields_to_explode:
10384            if field_to_explode not in self.get_header_infos_list() + list(
10385                query_transcripts_table.column_name
10386            ):
10387                msg_err = f"INFO/{field_to_explode} NOT IN header"
10388                log.error(msg_err)
10389                raise ValueError(msg_err)
10390
10391        # Explode fields to explode
10392        self.explode_infos(
10393            table=transcripts_table,
10394            fields=fields_to_explode,
10395        )
10396
10397        # Transcript preference file
10398        transcripts_preference_file = (
10399            param.get("transcripts", {})
10400            .get("prioritization", {})
10401            .get("prioritization_transcripts", {})
10402        )
10403        transcripts_preference_file = full_path(transcripts_preference_file)
10404
10405        # Transcript preference forced
10406        transcript_preference_force = (
10407            param.get("transcripts", {})
10408            .get("prioritization", {})
10409            .get("prioritization_transcripts_force", False)
10410        )
10411        # Transcript version forced
10412        transcript_version_force = (
10413            param.get("transcripts", {})
10414            .get("prioritization", {})
10415            .get("prioritization_transcripts_version_force", False)
10416        )
10417
10418        # Transcripts Ranking
10419        if transcripts_preference_file:
10420
10421            # Transcripts file to dataframe
10422            if os.path.exists(transcripts_preference_file):
10423                transcripts_preference_dataframe = transcripts_file_to_df(
10424                    transcripts_preference_file
10425                )
10426            else:
10427                log.error(
10428                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10429                )
10430                raise ValueError(
10431                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10432                )
10433
10434            # Order by depending to transcript preference forcing
10435            if transcript_preference_force:
10436                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10437            else:
10438                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10439
10440            # Transcript columns joined depend on version consideration
10441            if transcript_version_force:
10442                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10443            else:
10444                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10445
10446            # Query ranking for update
10447            query_update_ranking = f"""
10448                SELECT
10449                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10450                    ROW_NUMBER() OVER (
10451                        PARTITION BY "#CHROM", POS, REF, ALT
10452                        ORDER BY {order_by}
10453                    ) AS rn
10454                FROM {transcripts_table}
10455                LEFT JOIN 
10456                    (
10457                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10458                        FROM transcripts_preference_dataframe
10459                    ) AS transcripts_preference
10460                ON {transcripts_version_join}
10461            """
10462
10463        else:
10464
10465            # Query ranking for update
10466            query_update_ranking = f"""
10467                SELECT
10468                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10469                    ROW_NUMBER() OVER (
10470                        PARTITION BY "#CHROM", POS, REF, ALT
10471                        ORDER BY {" , ".join(query_update_order_list)}
10472                    ) AS rn
10473                FROM {transcripts_table}
10474            """
10475
10476        # Export Transcripts prioritization infos to variants table
10477        query_update = f"""
10478            WITH RankedTranscripts AS (
10479                {query_update_ranking}
10480            )
10481            UPDATE {table_variants}
10482                SET
10483                INFO = CONCAT(CASE
10484                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10485                            THEN ''
10486                            ELSE concat("INFO", ';')
10487                        END,
10488                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10489                        )
10490            FROM
10491                RankedTranscripts
10492            WHERE
10493                rn = 1
10494                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10495                AND variants."POS" = RankedTranscripts."POS"
10496                AND variants."REF" = RankedTranscripts."REF"
10497                AND variants."ALT" = RankedTranscripts."ALT"     
10498        """
10499
10500        # log.debug(f"query_update={query_update}")
10501        self.execute_query(query=query_update)
10502
10503        # Return
10504        return True
10505
10506    def create_transcript_view_from_columns_map(
10507        self,
10508        transcripts_table: str = "transcripts",
10509        columns_maps: dict = {},
10510        added_columns: list = [],
10511        temporary_tables: list = None,
10512        annotation_fields: list = None,
10513        column_rename: dict = {},
10514        column_clean: bool = False,
10515        column_case: str = None,
10516    ) -> tuple[list, list, list]:
10517        """
10518        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10519        specified columns mapping for transcripts data.
10520
10521        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10522        of the table where the transcripts data is stored or will be stored in the database. This table
10523        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10524        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10525        :type transcripts_table: str (optional)
10526        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10527        about how to map columns from a transcripts table to create a view. Each entry in the
10528        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10529        typically includes details such as the main transcript column and additional information columns
10530        :type columns_maps: dict
10531        :param added_columns: The `added_columns` parameter in the
10532        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10533        that will be added to the view being created based on the columns map provided. These columns
10534        are generated by exploding the transcript information columns along with the main transcript
10535        column
10536        :type added_columns: list
10537        :param temporary_tables: The `temporary_tables` parameter in the
10538        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10539        tables created during the process of creating a transcript view from a columns map. These
10540        temporary tables are used to store intermediate results or transformations before the final view
10541        is generated
10542        :type temporary_tables: list
10543        :param annotation_fields: The `annotation_fields` parameter in the
10544        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10545        used for annotation in the query view creation process. These fields are extracted from the
10546        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10547        :type annotation_fields: list
10548        :param column_rename: The `column_rename` parameter in the
10549        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10550        custom renaming for columns during the creation of the temporary table view. This parameter
10551        provides a mapping of original column names to the desired renamed column names. By using this
10552        parameter,
10553        :type column_rename: dict
10554        :param column_clean: The `column_clean` parameter in the
10555        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10556        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10557        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10558        False
10559        :type column_clean: bool (optional)
10560        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10561        function is used to specify the case transformation to be applied to the columns during the view
10562        creation process. It allows you to control whether the column values should be converted to
10563        lowercase, uppercase, or remain unchanged
10564        :type column_case: str
10565        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10566        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10567        """
10568
10569        log.debug("Start transcrpts view creation from columns map...")
10570
10571        # "from_columns_map": [
10572        #     {
10573        #         "transcripts_column": "Ensembl_transcriptid",
10574        #         "transcripts_infos_columns": [
10575        #             "genename",
10576        #             "Ensembl_geneid",
10577        #             "LIST_S2_score",
10578        #             "LIST_S2_pred",
10579        #         ],
10580        #     },
10581        #     {
10582        #         "transcripts_column": "Ensembl_transcriptid",
10583        #         "transcripts_infos_columns": [
10584        #             "genename",
10585        #             "VARITY_R_score",
10586        #             "Aloft_pred",
10587        #         ],
10588        #     },
10589        # ],
10590
10591        # Init
10592        if temporary_tables is None:
10593            temporary_tables = []
10594        if annotation_fields is None:
10595            annotation_fields = []
10596
10597        # Variants table
10598        table_variants = self.get_table_variants()
10599
10600        for columns_map in columns_maps:
10601
10602            # Log
10603            log.debug(f"columns_map={columns_map}")
10604
10605            # Transcript column
10606            transcripts_column = columns_map.get("transcripts_column", None)
10607
10608            # Transcripts infos columns
10609            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10610
10611            # Transcripts infos columns rename
10612            column_rename = columns_map.get("column_rename", column_rename)
10613
10614            # Transcripts infos columns clean
10615            column_clean = columns_map.get("column_clean", column_clean)
10616
10617            # Transcripts infos columns case
10618            column_case = columns_map.get("column_case", column_case)
10619
10620            if transcripts_column is not None:
10621
10622                # Explode
10623                added_columns += self.explode_infos(
10624                    fields=[transcripts_column] + transcripts_infos_columns
10625                )
10626
10627                # View clauses
10628                clause_select_variants = []
10629                clause_select_tanscripts = []
10630                for field in [transcripts_column] + transcripts_infos_columns:
10631
10632                    # AS field
10633                    as_field = field
10634
10635                    # Rename
10636                    if column_rename:
10637                        as_field = column_rename.get(as_field, as_field)
10638
10639                    # Clean
10640                    if column_clean:
10641                        as_field = clean_annotation_field(as_field)
10642
10643                    # Case
10644                    if column_case:
10645                        if column_case.lower() in ["lower"]:
10646                            as_field = as_field.lower()
10647                        elif column_case.lower() in ["upper"]:
10648                            as_field = as_field.upper()
10649
10650                    # Clause select Variants
10651                    clause_select_variants.append(
10652                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10653                    )
10654
10655                    if field in [transcripts_column]:
10656                        clause_select_tanscripts.append(
10657                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10658                        )
10659                    else:
10660                        clause_select_tanscripts.append(
10661                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10662                        )
10663                        annotation_fields.append(as_field)
10664
10665                # Query View
10666                query = f""" 
10667                    SELECT
10668                        "#CHROM", POS, REF, ALT, INFO,
10669                        "{transcripts_column}" AS 'transcript',
10670                        {", ".join(clause_select_tanscripts)}
10671                    FROM (
10672                        SELECT 
10673                            "#CHROM", POS, REF, ALT, INFO,
10674                            {", ".join(clause_select_variants)}
10675                        FROM {table_variants}
10676                        )
10677                    WHERE "{transcripts_column}" IS NOT NULL
10678                """
10679
10680                # Create temporary table
10681                temporary_table = transcripts_table + "".join(
10682                    random.choices(string.ascii_uppercase + string.digits, k=10)
10683                )
10684
10685                # # Temporary_tables
10686                # temporary_tables.append(temporary_table)
10687                # query_view = f"""
10688                #     CREATE TEMPORARY TABLE {temporary_table}
10689                #     AS ({query})
10690                # """
10691                # self.execute_query(query=query_view)
10692
10693                # Temporary_tables
10694                temporary_tables.append(temporary_table)
10695
10696                # List of unique #CHROM
10697                query_unique_chrom = f"""
10698                    SELECT DISTINCT "#CHROM"
10699                    FROM variants
10700                """
10701                unique_chroms = self.get_query_to_df(query=query_unique_chrom)
10702
10703                # Create table with structure but without data
10704                query_create_table = f"""
10705                    CREATE TABLE {temporary_table}
10706                    AS ({query} LIMIT 0)
10707                """
10708                self.execute_query(query=query_create_table)
10709
10710                # Process by #CHROM
10711                for chrom in unique_chroms["#CHROM"]:
10712
10713                    # Log
10714                    log.debug(f"Processing #CHROM={chrom}")
10715
10716                    # Select data by #CHROM
10717                    query_chunk = f"""
10718                        SELECT *
10719                        FROM ({query})
10720                        WHERE "#CHROM" = '{chrom}'
10721                    """
10722
10723                    # Insert data
10724                    query_insert_chunk = f"""
10725                        INSERT INTO {temporary_table}
10726                        {query_chunk}
10727                    """
10728                    self.execute_query(query=query_insert_chunk)
10729
10730        return added_columns, temporary_tables, annotation_fields
10731
10732    def create_transcript_view_from_column_format(
10733        self,
10734        transcripts_table: str = "transcripts",
10735        column_formats: dict = {},
10736        temporary_tables: list = None,
10737        annotation_fields: list = None,
10738        column_rename: dict = {},
10739        column_clean: bool = False,
10740        column_case: str = None,
10741    ) -> tuple[list, list, list]:
10742        """
10743        The `create_transcript_view_from_column_format` function generates a transcript view based on
10744        specified column formats, adds additional columns and annotation fields, and returns the list of
10745        temporary tables and annotation fields.
10746
10747        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10748        of the table containing the transcripts data. This table will be used as the base table for
10749        creating the transcript view. The default value for this parameter is "transcripts", but you can
10750        provide a different table name if needed, defaults to transcripts
10751        :type transcripts_table: str (optional)
10752        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10753        about the columns to be used for creating the transcript view. Each entry in the dictionary
10754        specifies the mapping between a transcripts column and a transcripts infos column. This
10755        parameter allows you to define how the columns from the transcripts table should be transformed
10756        or mapped
10757        :type column_formats: dict
10758        :param temporary_tables: The `temporary_tables` parameter in the
10759        `create_transcript_view_from_column_format` function is a list that stores the names of
10760        temporary views created during the process of creating a transcript view from a column format.
10761        These temporary views are used to manipulate and extract data before generating the final
10762        transcript view
10763        :type temporary_tables: list
10764        :param annotation_fields: The `annotation_fields` parameter in the
10765        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10766        that are extracted from the temporary views created during the process. These annotation fields
10767        are obtained by querying the temporary views and extracting the column names excluding specific
10768        columns like `#CH
10769        :type annotation_fields: list
10770        :param column_rename: The `column_rename` parameter in the
10771        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10772        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10773        column names to new column names in this dictionary, you can rename specific columns during the
10774        process
10775        :type column_rename: dict
10776        :param column_clean: The `column_clean` parameter in the
10777        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10778        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10779        will be cleaned during the creation of the transcript view based on the specified column format,
10780        defaults to False
10781        :type column_clean: bool (optional)
10782        :param column_case: The `column_case` parameter in the
10783        `create_transcript_view_from_column_format` function is used to specify the case transformation
10784        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10785        to convert the column names to uppercase or lowercase, respectively
10786        :type column_case: str
10787        :return: The `create_transcript_view_from_column_format` function returns two lists:
10788        `temporary_tables` and `annotation_fields`.
10789        """
10790
10791        log.debug("Start transcrpts view creation from column format...")
10792
10793        #  "from_column_format": [
10794        #     {
10795        #         "transcripts_column": "ANN",
10796        #         "transcripts_infos_column": "Feature_ID",
10797        #     }
10798        # ],
10799
10800        # Init
10801        if temporary_tables is None:
10802            temporary_tables = []
10803        if annotation_fields is None:
10804            annotation_fields = []
10805
10806        for column_format in column_formats:
10807
10808            # annotation field and transcript annotation field
10809            annotation_field = column_format.get("transcripts_column", "ANN")
10810            transcript_annotation = column_format.get(
10811                "transcripts_infos_column", "Feature_ID"
10812            )
10813
10814            # Transcripts infos columns rename
10815            column_rename = column_format.get("column_rename", column_rename)
10816
10817            # Transcripts infos columns clean
10818            column_clean = column_format.get("column_clean", column_clean)
10819
10820            # Transcripts infos columns case
10821            column_case = column_format.get("column_case", column_case)
10822
10823            # Temporary View name
10824            temporary_view_name = transcripts_table + "".join(
10825                random.choices(string.ascii_uppercase + string.digits, k=10)
10826            )
10827
10828            # Create temporary view name
10829            temporary_view_name = self.annotation_format_to_table(
10830                uniquify=True,
10831                annotation_field=annotation_field,
10832                view_name=temporary_view_name,
10833                annotation_id=transcript_annotation,
10834                column_rename=column_rename,
10835                column_clean=column_clean,
10836                column_case=column_case,
10837            )
10838
10839            # Annotation fields
10840            if temporary_view_name:
10841                query_annotation_fields = f"""
10842                    SELECT *
10843                    FROM (
10844                        DESCRIBE SELECT *
10845                        FROM {temporary_view_name}
10846                        )
10847                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10848                """
10849                df_annotation_fields = self.get_query_to_df(
10850                    query=query_annotation_fields
10851                )
10852
10853                # Add temporary view and annotation fields
10854                temporary_tables.append(temporary_view_name)
10855                annotation_fields += list(set(df_annotation_fields["column_name"]))
10856
10857        return temporary_tables, annotation_fields
10858
10859    def create_transcript_view(
10860        self,
10861        transcripts_table: str = None,
10862        transcripts_table_drop: bool = False,
10863        param: dict = {},
10864    ) -> str:
10865        """
10866        The `create_transcript_view` function generates a transcript view by processing data from a
10867        specified table based on provided parameters and structural information.
10868
10869        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10870        is used to specify the name of the table that will store the final transcript view data. If a table
10871        name is not provided, the function will create a new table to store the transcript view data, and by
10872        default,, defaults to transcripts
10873        :type transcripts_table: str (optional)
10874        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10875        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10876        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10877        the function will drop the existing transcripts table if it exists, defaults to False
10878        :type transcripts_table_drop: bool (optional)
10879        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10880        contains information needed to create a transcript view. It includes details such as the structure
10881        of the transcripts, columns mapping, column formats, and other necessary information for generating
10882        the view. This parameter allows for flexibility and customization
10883        :type param: dict
10884        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10885        created or modified during the execution of the function.
10886        """
10887
10888        log.debug("Start transcripts view creation...")
10889
10890        # Default
10891        transcripts_table_default = "transcripts"
10892
10893        # Param
10894        if not param:
10895            param = self.get_param()
10896
10897        # Struct
10898        struct = param.get("transcripts", {}).get("struct", None)
10899
10900        # Transcript veresion
10901        transcript_id_remove_version = param.get("transcripts", {}).get(
10902            "transcript_id_remove_version", False
10903        )
10904
10905        # Transcripts mapping
10906        transcript_id_mapping_file = param.get("transcripts", {}).get(
10907            "transcript_id_mapping_file", None
10908        )
10909
10910        # Transcripts mapping
10911        transcript_id_mapping_force = param.get("transcripts", {}).get(
10912            "transcript_id_mapping_force", None
10913        )
10914
10915        # Transcripts table
10916        if transcripts_table is None:
10917            transcripts_table = param.get("transcripts", {}).get(
10918                "table", transcripts_table_default
10919            )
10920
10921        # Check transcripts table exists
10922        if transcripts_table:
10923
10924            # Query to check if transcripts table exists
10925            query_check_table = f"""
10926                SELECT * 
10927                FROM information_schema.tables 
10928                WHERE table_name = '{transcripts_table}'
10929            """
10930            df_check_table = self.get_query_to_df(query=query_check_table)
10931
10932            # Check if transcripts table exists
10933            if len(df_check_table) > 0 and not transcripts_table_drop:
10934                log.debug(f"Table {transcripts_table} exists and not drop option")
10935                return transcripts_table
10936
10937        if struct:
10938
10939            # added_columns
10940            added_columns = []
10941
10942            # Temporary tables
10943            temporary_tables = []
10944
10945            # Annotation fields
10946            annotation_fields = []
10947
10948            # from columns map
10949            columns_maps = struct.get("from_columns_map", [])
10950            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10951                self.create_transcript_view_from_columns_map(
10952                    transcripts_table=transcripts_table,
10953                    columns_maps=columns_maps,
10954                    added_columns=added_columns,
10955                    temporary_tables=temporary_tables,
10956                    annotation_fields=annotation_fields,
10957                )
10958            )
10959            added_columns += added_columns_tmp
10960            temporary_tables += temporary_tables_tmp
10961            annotation_fields += annotation_fields_tmp
10962
10963            # from column format
10964            column_formats = struct.get("from_column_format", [])
10965            temporary_tables_tmp, annotation_fields_tmp = (
10966                self.create_transcript_view_from_column_format(
10967                    transcripts_table=transcripts_table,
10968                    column_formats=column_formats,
10969                    temporary_tables=temporary_tables,
10970                    annotation_fields=annotation_fields,
10971                )
10972            )
10973            temporary_tables += temporary_tables_tmp
10974            annotation_fields += annotation_fields_tmp
10975
10976            # Remove some specific fields/column
10977            annotation_fields = list(set(annotation_fields))
10978            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10979                if field in annotation_fields:
10980                    annotation_fields.remove(field)
10981
10982            # Merge temporary tables query
10983            query_merge = ""
10984            for temporary_table in list(set(temporary_tables)):
10985
10986                # First temporary table
10987                if not query_merge:
10988                    query_merge = f"""
10989                        SELECT * FROM {temporary_table}
10990                    """
10991                # other temporary table (using UNION)
10992                else:
10993                    query_merge += f"""
10994                        UNION BY NAME SELECT * FROM {temporary_table}
10995                    """
10996
10997            # transcript table tmp
10998            transcript_table_tmp = "transcripts_tmp"
10999            transcript_table_tmp2 = "transcripts_tmp2"
11000            transcript_table_tmp3 = "transcripts_tmp3"
11001
11002            # Merge on transcript
11003            query_merge_on_transcripts_annotation_fields = []
11004
11005            # Add transcript list
11006            query_merge_on_transcripts_annotation_fields.append(
11007                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
11008            )
11009
11010            # Aggregate all annotations fields
11011            for annotation_field in set(annotation_fields):
11012                query_merge_on_transcripts_annotation_fields.append(
11013                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
11014                )
11015
11016            # Transcripts mapping
11017            if transcript_id_mapping_file:
11018
11019                # Transcript dataframe
11020                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
11021                transcript_id_mapping_dataframe = transcripts_file_to_df(
11022                    transcript_id_mapping_file, column_names=["transcript", "alias"]
11023                )
11024
11025                # Transcript version remove
11026                if transcript_id_remove_version:
11027                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
11028                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
11029                    query_left_join = f"""
11030                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11031                    """
11032                else:
11033                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
11034                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
11035                    query_left_join = f"""
11036                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11037                    """
11038
11039                # Transcript column for group by merge
11040                query_transcript_merge_group_by = """
11041                        CASE
11042                            WHEN transcript_mapped NOT IN ('')
11043                            THEN split_part(transcript_mapped, '.', 1)
11044                            ELSE split_part(transcript_original, '.', 1)
11045                        END
11046                    """
11047
11048                # Merge query
11049                transcripts_tmp2_query = f"""
11050                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
11051                    FROM ({query_merge}) AS {transcript_table_tmp}
11052                    {query_left_join}
11053                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
11054                """
11055
11056                # Retrive columns after mege
11057                transcripts_tmp2_describe_query = f"""
11058                    DESCRIBE {transcripts_tmp2_query}
11059                """
11060                transcripts_tmp2_describe_list = list(
11061                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
11062                        "column_name"
11063                    ]
11064                )
11065
11066                # Create list of columns for select clause
11067                transcripts_tmp2_describe_select_clause = []
11068                for field in transcripts_tmp2_describe_list:
11069                    if field not in [
11070                        "#CHROM",
11071                        "POS",
11072                        "REF",
11073                        "ALT",
11074                        "INFO",
11075                        "transcript_mapped",
11076                    ]:
11077                        as_field = field
11078                        if field in ["transcript_original"]:
11079                            as_field = "transcripts_mapped"
11080                        transcripts_tmp2_describe_select_clause.append(
11081                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11082                        )
11083
11084                # Merge with mapping
11085                query_merge_on_transcripts = f"""
11086                    SELECT
11087                        "#CHROM", POS, REF, ALT, INFO,
11088                        CASE
11089                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11090                            THEN ANY_VALUE(transcript_mapped)
11091                            ELSE ANY_VALUE(transcript_original)
11092                        END AS transcript,
11093                        {", ".join(transcripts_tmp2_describe_select_clause)}
11094                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11095                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11096                        {query_transcript_merge_group_by}
11097                """
11098
11099                # Add transcript filter from mapping file
11100                if transcript_id_mapping_force:
11101                    query_merge_on_transcripts = f"""
11102                        SELECT *
11103                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11104                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11105                    """
11106
11107            # No transcript mapping
11108            else:
11109
11110                # Remove transcript version
11111                if transcript_id_remove_version:
11112                    query_transcript_column = f"""
11113                        split_part({transcript_table_tmp}.transcript, '.', 1)
11114                    """
11115                else:
11116                    query_transcript_column = """
11117                        transcript
11118                    """
11119
11120                # Query sections
11121                query_transcript_column_select = (
11122                    f"{query_transcript_column} AS transcript"
11123                )
11124                query_transcript_column_group_by = query_transcript_column
11125
11126                # Query for transcripts view
11127                query_merge_on_transcripts = f"""
11128                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11129                    FROM ({query_merge}) AS {transcript_table_tmp}
11130                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11131                """
11132
11133            # Drop transcript view is necessary
11134            if transcripts_table_drop:
11135                query_drop = f"""
11136                    DROP TABLE IF EXISTS {transcripts_table};
11137                """
11138                self.execute_query(query=query_drop)
11139
11140            # # Merge and create transcript view
11141            # query_create_view = f"""
11142            #     CREATE TABLE IF NOT EXISTS {transcripts_table}
11143            #     AS {query_merge_on_transcripts}
11144            # """
11145            # self.execute_query(query=query_create_view)
11146
11147            # Using #CHROM chunk
11148            ######
11149
11150            # List of unique #CHROM
11151            query_unique_chrom = f"""
11152                SELECT DISTINCT "#CHROM"
11153                FROM variants AS subquery
11154            """
11155            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11156
11157            # Create table with structure but without data, if not exists
11158            query_create_table = f"""
11159                CREATE TABLE IF NOT EXISTS {transcripts_table} AS
11160                SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0
11161            """
11162            self.execute_query(query=query_create_table)
11163
11164            # Process by #CHROM
11165            for chrom in unique_chroms["#CHROM"]:
11166
11167                # Log
11168                log.debug(f"Processing #CHROM={chrom}")
11169
11170                # Select data by #CHROM
11171                query_chunk = f"""
11172                    SELECT *
11173                    FROM ({query_merge_on_transcripts})
11174                    WHERE "#CHROM" = '{chrom}'
11175                """
11176
11177                # Insert data
11178                query_insert_chunk = f"""
11179                    INSERT INTO {transcripts_table}
11180                    {query_chunk}
11181                """
11182                self.execute_query(query=query_insert_chunk)
11183
11184            # Remove temporary tables
11185            if temporary_tables:
11186                for temporary_table in list(set(temporary_tables)):
11187                    query_drop_tmp_table = f"""
11188                        DROP TABLE IF EXISTS {temporary_table}
11189                    """
11190                    self.execute_query(query=query_drop_tmp_table)
11191
11192            # Remove added columns
11193            for added_column in added_columns:
11194                self.drop_column(column=added_column)
11195
11196        else:
11197
11198            transcripts_table = None
11199
11200        return transcripts_table
11201
11202    def annotation_format_to_table(
11203        self,
11204        uniquify: bool = True,
11205        annotation_field: str = "ANN",
11206        annotation_id: str = "Feature_ID",
11207        view_name: str = "transcripts",
11208        column_rename: dict = {},
11209        column_clean: bool = False,
11210        column_case: str = None,
11211    ) -> str:
11212        """
11213        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11214        structured table format, ensuring unique values and creating a temporary table for further
11215        processing or analysis.
11216
11217        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11218        unique values in the output or not. If set to `True`, the function will make sure that the
11219        output values are unique, defaults to True
11220        :type uniquify: bool (optional)
11221        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11222        that contains the annotation information for each variant. This field is used to extract the
11223        annotation details for further processing in the function. By default, it is set to "ANN",
11224        defaults to ANN
11225        :type annotation_field: str (optional)
11226        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11227        is used to specify the identifier for the annotation feature. This identifier will be used as a
11228        column name in the resulting table or view that is created based on the annotation data. It
11229        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11230        :type annotation_id: str (optional)
11231        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11232        to specify the name of the temporary table that will be created to store the transformed
11233        annotation data. This table will hold the extracted information from the annotation field in a
11234        structured format for further processing or analysis. By default,, defaults to transcripts
11235        :type view_name: str (optional)
11236        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11237        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11238        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11239        created based on the annotation data. This feature enables
11240        :type column_rename: dict
11241        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11242        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11243        If set to `True`, the function will clean the annotation field before further processing. This
11244        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11245        to False
11246        :type column_clean: bool (optional)
11247        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11248        used to specify the case transformation to be applied to the column names extracted from the
11249        annotation data. It allows you to set the case of the column names to either lowercase or
11250        uppercase for consistency or other specific requirements during the conversion
11251        :type column_case: str
11252        :return: The function `annotation_format_to_table` is returning the name of the view created,
11253        which is stored in the variable `view_name`.
11254        """
11255
11256        # Annotation field
11257        annotation_format = "annotation_explode"
11258
11259        # Transcript annotation
11260        if column_rename:
11261            annotation_id = column_rename.get(annotation_id, annotation_id)
11262
11263        if column_clean:
11264            annotation_id = clean_annotation_field(annotation_id)
11265
11266        # Prefix
11267        prefix = self.get_explode_infos_prefix()
11268        if prefix:
11269            prefix = "INFO/"
11270
11271        # Annotation fields
11272        annotation_infos = prefix + annotation_field
11273        annotation_format_infos = prefix + annotation_format
11274
11275        # Variants table
11276        table_variants = self.get_table_variants()
11277
11278        # Header
11279        vcf_reader = self.get_header()
11280
11281        # Add columns
11282        added_columns = []
11283
11284        # Explode HGVS field in column
11285        added_columns += self.explode_infos(fields=[annotation_field])
11286
11287        if annotation_field in vcf_reader.infos:
11288
11289            # Extract ANN header
11290            ann_description = vcf_reader.infos[annotation_field].desc
11291            pattern = r"'(.+?)'"
11292            match = re.search(pattern, ann_description)
11293            if match:
11294                ann_header_match = match.group(1).split(" | ")
11295                ann_header = []
11296                ann_header_desc = {}
11297                for i in range(len(ann_header_match)):
11298                    ann_header_info = "".join(
11299                        char for char in ann_header_match[i] if char.isalnum()
11300                    )
11301                    ann_header.append(ann_header_info)
11302                    ann_header_desc[ann_header_info] = ann_header_match[i]
11303                if not ann_header_desc:
11304                    raise ValueError("Invalid header description format")
11305            else:
11306                raise ValueError("Invalid header description format")
11307
11308            # Create variant id
11309            variant_id_column = self.get_variant_id_column()
11310            added_columns += [variant_id_column]
11311
11312            # Get list of #CHROM
11313            query_unique_chrom = f"""
11314                SELECT DISTINCT "#CHROM"
11315                FROM variants AS subquery
11316            """
11317            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11318
11319            # Base for database anontation format
11320            dataframe_annotation_format_base = f"""
11321                SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}"
11322                FROM {table_variants}
11323            """
11324
11325            # Create dataframe for keys column type
11326            dataframe_annotation_format = self.get_query_to_df(
11327                f""" {dataframe_annotation_format_base} LIMIT 1000 """
11328            )
11329
11330            # Define a vectorized function to apply explode_annotation_format
11331            vectorized_explode_annotation_format = np.vectorize(
11332                lambda x: explode_annotation_format(
11333                    annotation=str(x),
11334                    uniquify=uniquify,
11335                    output_format="JSON",
11336                    prefix="",
11337                    header=list(ann_header_desc.values()),
11338                )
11339            )
11340
11341            # Assign the exploded annotations back to the dataframe
11342            dataframe_annotation_format[annotation_format_infos] = (
11343                vectorized_explode_annotation_format(
11344                    dataframe_annotation_format[annotation_infos].to_numpy()
11345                )
11346            )
11347
11348            # Find keys
11349            query_json = f"""
11350                SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key'
11351                FROM dataframe_annotation_format;
11352            """
11353            df_keys = self.get_query_to_df(query=query_json)
11354
11355            # Check keys
11356            query_json_key = []
11357            for _, row in df_keys.iterrows():
11358
11359                # Key
11360                key = row.iloc[0]
11361                key_clean = key
11362
11363                # key rename
11364                if column_rename:
11365                    key_clean = column_rename.get(key_clean, key_clean)
11366
11367                # key clean
11368                if column_clean:
11369                    key_clean = clean_annotation_field(key_clean)
11370
11371                # Key case
11372                if column_case:
11373                    if column_case.lower() in ["lower"]:
11374                        key_clean = key_clean.lower()
11375                    elif column_case.lower() in ["upper"]:
11376                        key_clean = key_clean.upper()
11377
11378                # Type
11379                query_json_type = f"""
11380                    SELECT * 
11381                    FROM (
11382                        SELECT 
11383                            NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '') AS '{key_clean}'
11384                        FROM
11385                            dataframe_annotation_format
11386                        )
11387                    WHERE "{key_clean}" NOT NULL AND "{key_clean}" NOT IN ('')
11388                """
11389
11390                # Get DataFrame from query
11391                df_json_type = self.get_query_to_df(query=query_json_type)
11392
11393                # Detect column type
11394                column_type = detect_column_type(df_json_type[key_clean])
11395
11396                # Free up memory
11397                del df_json_type
11398
11399                # Append
11400                query_json_key.append(
11401                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11402                )
11403
11404            # Create table with structure but without data, if not exists
11405            query_create_table = f"""
11406                CREATE TABLE IF NOT EXISTS {view_name}
11407                AS (
11408                    SELECT *, {annotation_id} AS 'transcript'
11409                    FROM (
11410                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11411                        FROM dataframe_annotation_format
11412                        )
11413                    LIMIT 0
11414                    );
11415            """
11416            self.execute_query(query=query_create_table)
11417
11418            # Free up memory
11419            del dataframe_annotation_format
11420
11421            # Insert data by chromosome
11422            for chrom in unique_chroms["#CHROM"]:
11423
11424                # Log
11425                log.debug(f"Processing #CHROM={chrom}")
11426
11427                # Create dataframe
11428                dataframe_annotation_format = self.get_query_to_df(
11429                    f""" {dataframe_annotation_format_base}  WHERE "#CHROM" = '{chrom}' """
11430                )
11431
11432                # Define a vectorized function to apply explode_annotation_format
11433                vectorized_explode_annotation_format = np.vectorize(
11434                    lambda x: explode_annotation_format(
11435                        annotation=str(x),
11436                        uniquify=uniquify,
11437                        output_format="JSON",
11438                        prefix="",
11439                        header=list(ann_header_desc.values()),
11440                    )
11441                )
11442
11443                # Assign the exploded annotations back to the dataframe
11444                dataframe_annotation_format[annotation_format_infos] = (
11445                    vectorized_explode_annotation_format(
11446                        dataframe_annotation_format[annotation_infos].to_numpy()
11447                    )
11448                )
11449
11450                # Insert data into tmp table
11451                query_insert_chunk = f"""
11452                    INSERT INTO {view_name}
11453                    SELECT *, {annotation_id} AS 'transcript'
11454                    FROM (
11455                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11456                        FROM dataframe_annotation_format
11457                        )
11458                """
11459                self.execute_query(query=query_insert_chunk)
11460
11461                # Free up memory
11462                del dataframe_annotation_format
11463
11464        else:
11465
11466            # Return None
11467            view_name = None
11468
11469        # Remove added columns
11470        for added_column in added_columns:
11471            self.drop_column(column=added_column)
11472
11473        return view_name
11474
11475    def transcript_view_to_variants(
11476        self,
11477        transcripts_table: str = None,
11478        transcripts_column_id: str = None,
11479        transcripts_info_json: str = None,
11480        transcripts_info_field_json: str = None,
11481        transcripts_info_format: str = None,
11482        transcripts_info_field_format: str = None,
11483        param: dict = {},
11484    ) -> bool:
11485        """
11486        The `transcript_view_to_variants` function updates a variants table with information from
11487        transcripts in JSON format.
11488
11489        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11490        table containing the transcripts data. If this parameter is not provided, the function will
11491        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11492        :type transcripts_table: str
11493        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11494        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11495        identifier is used to match transcripts with variants in the database
11496        :type transcripts_column_id: str
11497        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11498        of the column in the variants table where the transcripts information will be stored in JSON
11499        format. This parameter allows you to define the column in the variants table that will hold the
11500        JSON-formatted information about transcripts
11501        :type transcripts_info_json: str
11502        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11503        specify the field in the VCF header that will contain information about transcripts in JSON
11504        format. This field will be added to the VCF header as an INFO field with the specified name
11505        :type transcripts_info_field_json: str
11506        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11507        format of the information about transcripts that will be stored in the variants table. This
11508        format can be used to define how the transcript information will be structured or displayed
11509        within the variants table
11510        :type transcripts_info_format: str
11511        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11512        specify the field in the VCF header that will contain information about transcripts in a
11513        specific format. This field will be added to the VCF header as an INFO field with the specified
11514        name
11515        :type transcripts_info_field_format: str
11516        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11517        that contains various configuration settings related to transcripts. It is used to provide
11518        default values for certain parameters if they are not explicitly provided when calling the
11519        method. The `param` dictionary can be passed as an argument
11520        :type param: dict
11521        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11522        if the operation is successful and `False` if certain conditions are not met.
11523        """
11524
11525        msg_info_prefix = "Start transcripts view to variants annotations"
11526
11527        log.debug(f"{msg_info_prefix}...")
11528
11529        # Default
11530        transcripts_table_default = "transcripts"
11531        transcripts_column_id_default = "transcript"
11532        transcripts_info_json_default = None
11533        transcripts_info_format_default = None
11534        transcripts_info_field_json_default = None
11535        transcripts_info_field_format_default = None
11536
11537        # Param
11538        if not param:
11539            param = self.get_param()
11540
11541        # Transcripts table
11542        if transcripts_table is None:
11543            transcripts_table = param.get("transcripts", {}).get(
11544                "table", transcripts_table_default
11545            )
11546
11547        # Transcripts column ID
11548        if transcripts_column_id is None:
11549            transcripts_column_id = param.get("transcripts", {}).get(
11550                "column_id", transcripts_column_id_default
11551            )
11552
11553        # Transcripts info json
11554        if transcripts_info_json is None:
11555            transcripts_info_json = param.get("transcripts", {}).get(
11556                "transcripts_info_json", transcripts_info_json_default
11557            )
11558
11559        # Transcripts info field JSON
11560        if transcripts_info_field_json is None:
11561            transcripts_info_field_json = param.get("transcripts", {}).get(
11562                "transcripts_info_field_json", transcripts_info_field_json_default
11563            )
11564        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11565        #     transcripts_info_json = transcripts_info_field_json
11566
11567        # Transcripts info format
11568        if transcripts_info_format is None:
11569            transcripts_info_format = param.get("transcripts", {}).get(
11570                "transcripts_info_format", transcripts_info_format_default
11571            )
11572
11573        # Transcripts info field FORMAT
11574        if transcripts_info_field_format is None:
11575            transcripts_info_field_format = param.get("transcripts", {}).get(
11576                "transcripts_info_field_format", transcripts_info_field_format_default
11577            )
11578        # if (
11579        #     transcripts_info_field_format is not None
11580        #     and transcripts_info_format is None
11581        # ):
11582        #     transcripts_info_format = transcripts_info_field_format
11583
11584        # Variants table
11585        table_variants = self.get_table_variants()
11586
11587        # Check info columns param
11588        if (
11589            transcripts_info_json is None
11590            and transcripts_info_field_json is None
11591            and transcripts_info_format is None
11592            and transcripts_info_field_format is None
11593        ):
11594            return False
11595
11596        # Transcripts infos columns
11597        query_transcripts_infos_columns = f"""
11598            SELECT *
11599            FROM (
11600                DESCRIBE SELECT * FROM {transcripts_table}
11601                )
11602            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11603        """
11604        transcripts_infos_columns = list(
11605            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11606        )
11607
11608        # View results
11609        clause_select = []
11610        clause_to_json = []
11611        clause_to_format = []
11612        for field in transcripts_infos_columns:
11613            # Do not consider INFO field for export into fields
11614            if field not in ["INFO"]:
11615                clause_select.append(
11616                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11617                )
11618                clause_to_json.append(f""" '{field}': "{field}" """)
11619                clause_to_format.append(f""" "{field}" """)
11620
11621        # Update
11622        update_set_json = []
11623        update_set_format = []
11624
11625        # VCF header
11626        vcf_reader = self.get_header()
11627
11628        # Transcripts to info column in JSON
11629        if transcripts_info_json:
11630
11631            # Create column on variants table
11632            self.add_column(
11633                table_name=table_variants,
11634                column_name=transcripts_info_json,
11635                column_type="JSON",
11636                default_value=None,
11637                drop=False,
11638            )
11639
11640            # Add header
11641            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11642                transcripts_info_json,
11643                ".",
11644                "String",
11645                "Transcripts in JSON format",
11646                "unknwon",
11647                "unknwon",
11648                self.code_type_map["String"],
11649            )
11650
11651            # Add to update
11652            update_set_json.append(
11653                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11654            )
11655
11656        # Transcripts to info field in JSON
11657        if transcripts_info_field_json:
11658
11659            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11660
11661            # Add to update
11662            update_set_json.append(
11663                f""" 
11664                    INFO = concat(
11665                            CASE
11666                                WHEN INFO NOT IN ('', '.')
11667                                THEN INFO
11668                                ELSE ''
11669                            END,
11670                            CASE
11671                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11672                                THEN concat(
11673                                    ';{transcripts_info_field_json}=',
11674                                    t.{transcripts_info_json}
11675                                )
11676                                ELSE ''
11677                            END
11678                            )
11679                """
11680            )
11681
11682            # Add header
11683            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11684                transcripts_info_field_json,
11685                ".",
11686                "String",
11687                "Transcripts in JSON format",
11688                "unknwon",
11689                "unknwon",
11690                self.code_type_map["String"],
11691            )
11692
11693        if update_set_json:
11694
11695            # Update query
11696            query_update = f"""
11697                UPDATE {table_variants}
11698                    SET {", ".join(update_set_json)}
11699                FROM
11700                (
11701                    SELECT
11702                        "#CHROM", POS, REF, ALT,
11703                            concat(
11704                            '{{',
11705                            string_agg(
11706                                '"' || "{transcripts_column_id}" || '":' ||
11707                                to_json(json_output)
11708                            ),
11709                            '}}'
11710                            )::JSON AS {transcripts_info_json}
11711                    FROM
11712                        (
11713                        SELECT
11714                            "#CHROM", POS, REF, ALT,
11715                            "{transcripts_column_id}",
11716                            to_json(
11717                                {{{",".join(clause_to_json)}}}
11718                            )::JSON AS json_output
11719                        FROM
11720                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11721                        WHERE "{transcripts_column_id}" IS NOT NULL
11722                        )
11723                    GROUP BY "#CHROM", POS, REF, ALT
11724                ) AS t
11725                WHERE {table_variants}."#CHROM" = t."#CHROM"
11726                    AND {table_variants}."POS" = t."POS"
11727                    AND {table_variants}."REF" = t."REF"
11728                    AND {table_variants}."ALT" = t."ALT"
11729            """
11730
11731            self.execute_query(query=query_update)
11732
11733        # Transcripts to info column in FORMAT
11734        if transcripts_info_format:
11735
11736            # Create column on variants table
11737            self.add_column(
11738                table_name=table_variants,
11739                column_name=transcripts_info_format,
11740                column_type="VARCHAR",
11741                default_value=None,
11742                drop=False,
11743            )
11744
11745            # Add header
11746            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11747                transcripts_info_format,
11748                ".",
11749                "String",
11750                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11751                "unknwon",
11752                "unknwon",
11753                self.code_type_map["String"],
11754            )
11755
11756            # Add to update
11757            update_set_format.append(
11758                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11759            )
11760
11761        else:
11762
11763            # Set variable for internal queries
11764            transcripts_info_format = "transcripts_info_format"
11765
11766        # Transcripts to info field in JSON
11767        if transcripts_info_field_format:
11768
11769            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11770
11771            # Add to update
11772            update_set_format.append(
11773                f""" 
11774                    INFO = concat(
11775                            CASE
11776                                WHEN INFO NOT IN ('', '.')
11777                                THEN INFO
11778                                ELSE ''
11779                            END,
11780                            CASE
11781                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11782                                THEN concat(
11783                                    ';{transcripts_info_field_format}=',
11784                                    t.{transcripts_info_format}
11785                                )
11786                                ELSE ''
11787                            END
11788                            )
11789                """
11790            )
11791
11792            # Add header
11793            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11794                transcripts_info_field_format,
11795                ".",
11796                "String",
11797                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11798                "unknwon",
11799                "unknwon",
11800                self.code_type_map["String"],
11801            )
11802
11803        if update_set_format:
11804
11805            # Update query
11806            query_update = f"""
11807                UPDATE {table_variants}
11808                    SET {", ".join(update_set_format)}
11809                FROM
11810                (
11811                    SELECT
11812                        "#CHROM", POS, REF, ALT,
11813                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11814                    FROM 
11815                        (
11816                        SELECT
11817                            "#CHROM", POS, REF, ALT,
11818                            "{transcripts_column_id}",
11819                            concat(
11820                                "{transcripts_column_id}",
11821                                '|',
11822                                {", '|', ".join(clause_to_format)}
11823                            ) AS {transcripts_info_format}
11824                        FROM
11825                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11826                        )
11827                    GROUP BY "#CHROM", POS, REF, ALT
11828                ) AS t
11829                WHERE {table_variants}."#CHROM" = t."#CHROM"
11830                    AND {table_variants}."POS" = t."POS"
11831                    AND {table_variants}."REF" = t."REF"
11832                    AND {table_variants}."ALT" = t."ALT"
11833            """
11834
11835            self.execute_query(query=query_update)
11836
11837        return True
11838
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        Rename or remove INFO fields in the VCF header and rewrite the INFO
        column of the variants table accordingly.

        For each ``original_name -> new_name`` entry, the header record is
        re-created under the new name (or simply dropped when the new name is
        ``None``), and a regexp substitution rewriting ``original_name`` /
        ``original_name=value`` occurrences in the INFO column is queued. The
        queued substitutions are applied as nested ``regexp_replace`` calls,
        batched in groups of 125 per UPDATE statement (presumably to keep the
        SQL expression depth manageable for the engine — TODO confirm).

        Nothing is changed when the configured database access mode is "RO".

        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
        represent the original field names that need to be renamed, and the corresponding values
        represent the new names (a value of None removes the field instead)
        :type fields_to_rename: dict
        :param table: The `table` parameter represents the name of the table whose INFO column is
        updated; it defaults to the variants table
        :type table: str
        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
        the original field names as keys and their corresponding new names (or None if the field was
        removed) as values, covering only the fields actually found in the header.
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        # Default to the main variants table
        if table is None:
            table = self.get_table_variants()

        # Accumulated regexp-replace expressions, partitioned into batches so
        # each UPDATE carries at most ~125 nested regexp_replace calls
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "concat(INFO, ';')"  # Append ';' so the last field also ends with ';' — reduces regexp complexity

        # Skip entirely in read-only mode or when nothing to rename
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Re-create the header record under the new name, copying
                    # number/type/description from the old one; then drop the old
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Pattern matching 'FIELD;' (flag) or 'FIELD=value;' at the
                    # start of INFO or after a ';' separator
                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
                    if field_renamed is not None:
                        field_renamed_pattern = rf"\1{field_renamed}\3;"
                    else:
                        # Removal: keep only the leading separator
                        field_renamed_pattern = r"\1"

                    # Wrap the running expression in one more regexp_replace;
                    # start a fresh base expression at every partition boundary
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(
                        regex_replace_nb / regex_replace_partition
                    )
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "concat(INFO, ';')"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
                        )
                    else:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' removed"
                        )

                else:

                    log.warning(
                        f"Rename or remove fields - field '{field_to_rename}' not in header"
                    )

            # Run one UPDATE per batch of nested replacements; the outer
            # regexp_replace strips the trailing ';' appended for matching
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(
                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
                )
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = regexp_replace({regex_replace}, ';$', '')
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
11950
11951    def calculation_rename_info_fields(
11952        self,
11953        fields_to_rename: dict = None,
11954        table: str = None,
11955        operation_name: str = "RENAME_INFO_FIELDS",
11956    ) -> None:
11957        """
11958        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11959        fields to rename and table if provided, and then calls another function to rename the fields.
11960
11961        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11962        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11963        the key and the new field name as the value
11964        :type fields_to_rename: dict
11965        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11966        specify the name of the table for which the fields are to be renamed. It is a string type
11967        parameter
11968        :type table: str
11969        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11970        method is a string that specifies the name of the operation being performed. In this context, it
11971        is used as a default value for the operation name if not explicitly provided when calling the
11972        function, defaults to RENAME_INFO_FIELDS
11973        :type operation_name: str (optional)
11974        """
11975
11976        # Param
11977        param = self.get_param()
11978
11979        # Get param fields to rename
11980        param_fields_to_rename = (
11981            param.get("calculation", {})
11982            .get("calculations", {})
11983            .get(operation_name, {})
11984            .get("fields_to_rename", None)
11985        )
11986
11987        # Get param table
11988        param_table = (
11989            param.get("calculation", {})
11990            .get("calculations", {})
11991            .get(operation_name, {})
11992            .get("table", None)
11993        )
11994
11995        # Init fields_to_rename
11996        if fields_to_rename is None:
11997            fields_to_rename = param_fields_to_rename
11998
11999        # Init table
12000        if table is None:
12001            table = param_table
12002
12003        renamed_fields = self.rename_info_fields(
12004            fields_to_rename=fields_to_rename, table=table
12005        )
12006
12007        log.debug(f"renamed_fields:{renamed_fields}")
class Variants:
   37class Variants:
   38
   39    def __init__(
   40        self,
   41        conn=None,
   42        input: str = None,
   43        output: str = None,
   44        config: dict = {},
   45        param: dict = {},
   46        load: bool = False,
   47    ) -> None:
   48        """
   49        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   50        header
   51
   52        :param conn: the connection to the database
   53        :param input: the input file
   54        :param output: the output file
   55        :param config: a dictionary containing the configuration of the model
   56        :param param: a dictionary containing the parameters of the model
   57        """
   58
   59        # Init variables
   60        self.init_variables()
   61
   62        # Input
   63        self.set_input(input)
   64
   65        # Config
   66        self.set_config(config)
   67
   68        # Param
   69        self.set_param(param)
   70
   71        # Output
   72        self.set_output(output)
   73
   74        # connexion
   75        self.set_connexion(conn)
   76
   77        # Header
   78        self.set_header()
   79
   80        # Samples
   81        self.set_samples()
   82
   83        # Load data
   84        if load:
   85            self.load_data()
   86
   87    def set_samples(self, samples: list = None) -> list:
   88        """
   89        The function `set_samples` sets the samples attribute of an object to a provided list or
   90        retrieves it from a parameter dictionary.
   91
   92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   93        input and sets the `samples` attribute of the class to the provided list. If no samples are
   94        provided, it tries to get the samples from the class's parameters using the `get_param` method
   95        :type samples: list
   96        :return: The `samples` list is being returned.
   97        """
   98
   99        if not samples:
  100            samples = self.get_param().get("samples", {}).get("list", None)
  101
  102        self.samples = samples
  103
  104        return samples
  105
  106    def get_samples(self) -> list:
  107        """
  108        This function returns a list of samples.
  109        :return: The `get_samples` method is returning the `samples` attribute of the object.
  110        """
  111
  112        return self.samples
  113
  114    def get_samples_check(self) -> bool:
  115        """
  116        This function returns the value of the "check" key within the "samples" dictionary retrieved
  117        from the parameters.
  118        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  119        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  120        method. If the key "check" is not found, it will return `False`.
  121        """
  122
  123        return self.get_param().get("samples", {}).get("check", True)
  124
  125    def set_input(self, input: str = None) -> None:
  126        """
  127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  128        attributes in the class accordingly.
  129
  130        :param input: The `set_input` method in the provided code snippet is used to set attributes
  131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  132        :type input: str
  133        """
  134
  135        if input and not isinstance(input, str):
  136            try:
  137                self.input = input.name
  138            except:
  139                log.error(f"Input file '{input} in bad format")
  140                raise ValueError(f"Input file '{input} in bad format")
  141        else:
  142            self.input = input
  143
  144        # Input format
  145        if input:
  146            input_name, input_extension = os.path.splitext(self.input)
  147            self.input_name = input_name
  148            self.input_extension = input_extension
  149            self.input_format = self.input_extension.replace(".", "")
  150
  151    def set_config(self, config: dict) -> None:
  152        """
  153        The set_config function takes a config object and assigns it as the configuration object for the
  154        class.
  155
  156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  157        contains configuration settings for the class. When you call the `set_config` function with a
  158        dictionary object as the argument, it will set that dictionary as the configuration object for
  159        the class
  160        :type config: dict
  161        """
  162
  163        self.config = config
  164
  165    def set_param(self, param: dict) -> None:
  166        """
  167        This function sets a parameter object for the class based on the input dictionary.
  168
  169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  170        as the `param` attribute of the class instance
  171        :type param: dict
  172        """
  173
  174        self.param = param
  175
  176    def init_variables(self) -> None:
  177        """
  178        This function initializes the variables that will be used in the rest of the class
  179        """
  180
  181        self.prefix = "howard"
  182        self.table_variants = "variants"
  183        self.dataframe = None
  184
  185        self.comparison_map = {
  186            "gt": ">",
  187            "gte": ">=",
  188            "lt": "<",
  189            "lte": "<=",
  190            "equals": "=",
  191            "contains": "SIMILAR TO",
  192        }
  193
  194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  195
  196        self.code_type_map_to_sql = {
  197            "Integer": "INTEGER",
  198            "String": "VARCHAR",
  199            "Float": "FLOAT",
  200            "Flag": "VARCHAR",
  201        }
  202
  203        self.index_additionnal_fields = []
  204
  205    def get_indexing(self) -> bool:
  206        """
  207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  208        returns False.
  209        :return: The value of the indexing parameter.
  210        """
  211
  212        return self.get_param().get("indexing", False)
  213
  214    def get_connexion_config(self) -> dict:
  215        """
  216        The function `get_connexion_config` returns a dictionary containing the configuration for a
  217        connection, including the number of threads and memory limit.
  218        :return: a dictionary containing the configuration for the Connexion library.
  219        """
  220
  221        # config
  222        config = self.get_config()
  223
  224        # Connexion config
  225        connexion_config = {}
  226        threads = self.get_threads()
  227
  228        # Threads
  229        if threads:
  230            connexion_config["threads"] = threads
  231
  232        # Memory
  233        # if config.get("memory", None):
  234        #     connexion_config["memory_limit"] = config.get("memory")
  235        if self.get_memory():
  236            connexion_config["memory_limit"] = self.get_memory()
  237
  238        # Temporary directory
  239        if config.get("tmp", None):
  240            connexion_config["temp_directory"] = config.get("tmp")
  241
  242        # Access
  243        if config.get("access", None):
  244            access = config.get("access")
  245            if access in ["RO"]:
  246                access = "READ_ONLY"
  247            elif access in ["RW"]:
  248                access = "READ_WRITE"
  249            connexion_db = self.get_connexion_db()
  250            if connexion_db in ":memory:":
  251                access = "READ_WRITE"
  252            connexion_config["access_mode"] = access
  253
  254        return connexion_config
  255
  256    def get_duckdb_settings(self) -> dict:
  257        """
  258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  259        string.
  260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  261        """
  262
  263        # config
  264        config = self.get_config()
  265
  266        # duckdb settings
  267        duckdb_settings_dict = {}
  268        if config.get("duckdb_settings", None):
  269            duckdb_settings = config.get("duckdb_settings")
  270            duckdb_settings = full_path(duckdb_settings)
  271            # duckdb setting is a file
  272            if os.path.exists(duckdb_settings):
  273                with open(duckdb_settings) as json_file:
  274                    duckdb_settings_dict = yaml.safe_load(json_file)
  275            # duckdb settings is a string
  276            else:
  277                duckdb_settings_dict = json.loads(duckdb_settings)
  278
  279        return duckdb_settings_dict
  280
  281    def set_connexion_db(self) -> str:
  282        """
  283        The function `set_connexion_db` returns the appropriate database connection string based on the
  284        input format and connection type.
  285        :return: the value of the variable `connexion_db`.
  286        """
  287
  288        # Default connexion db
  289        default_connexion_db = ":memory:"
  290
  291        # Find connexion db
  292        if self.get_input_format() in ["db", "duckdb"]:
  293            connexion_db = self.get_input()
  294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  295            connexion_db = default_connexion_db
  296        elif self.get_connexion_type() in ["tmpfile"]:
  297            tmp_name = tempfile.mkdtemp(
  298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  299            )
  300            connexion_db = f"{tmp_name}/tmp.db"
  301        elif self.get_connexion_type() != "":
  302            connexion_db = self.get_connexion_type()
  303        else:
  304            connexion_db = default_connexion_db
  305
  306        # Set connexion db
  307        self.connexion_db = connexion_db
  308
  309        return connexion_db
  310
  311    def set_connexion(self, conn) -> None:
  312        """
  313        The function `set_connexion` creates a connection to a database, with options for different
  314        database formats and settings.
  315
  316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  317        database. If a connection is not provided, a new connection to an in-memory database is created.
  318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  319        sqlite
  320        """
  321
  322        # Connexion db
  323        connexion_db = self.set_connexion_db()
  324
  325        # Connexion config
  326        connexion_config = self.get_connexion_config()
  327
  328        # Connexion format
  329        connexion_format = self.get_config().get("connexion_format", "duckdb")
  330        # Set connexion format
  331        self.connexion_format = connexion_format
  332
  333        # Connexion
  334        if not conn:
  335            if connexion_format in ["duckdb"]:
  336                conn = duckdb.connect(connexion_db, config=connexion_config)
  337                # duckDB settings
  338                duckdb_settings = self.get_duckdb_settings()
  339                if duckdb_settings:
  340                    for setting in duckdb_settings:
  341                        setting_value = duckdb_settings.get(setting)
  342                        if isinstance(setting_value, str):
  343                            setting_value = f"'{setting_value}'"
  344                        conn.execute(f"PRAGMA {setting}={setting_value};")
  345            elif connexion_format in ["sqlite"]:
  346                conn = sqlite3.connect(connexion_db)
  347
  348        # Set connexion
  349        self.conn = conn
  350
  351        # Log
  352        log.debug(f"connexion_format: {connexion_format}")
  353        log.debug(f"connexion_db: {connexion_db}")
  354        log.debug(f"connexion config: {connexion_config}")
  355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  356
  357    def set_output(self, output: str = None) -> None:
  358        """
  359        The `set_output` function in Python sets the output file based on the input or a specified key
  360        in the config file, extracting the output name, extension, and format.
  361
  362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  363        the output file. If the config file has an 'output' key, the method sets the output to the value
  364        of that key. If no output is provided, it sets the output to `None`
  365        :type output: str
  366        """
  367
  368        if output and not isinstance(output, str):
  369            self.output = output.name
  370        else:
  371            self.output = output
  372
  373        # Output format
  374        if self.output:
  375            output_name, output_extension = os.path.splitext(self.output)
  376            self.output_name = output_name
  377            self.output_extension = output_extension
  378            self.output_format = self.output_extension.replace(".", "")
  379        else:
  380            self.output_name = None
  381            self.output_extension = None
  382            self.output_format = None
  383
  384    def set_header(self) -> None:
  385        """
  386        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  387        """
  388
  389        input_file = self.get_input()
  390        default_header_list = [
  391            "##fileformat=VCFv4.2",
  392            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  393        ]
  394
  395        # Full path
  396        input_file = full_path(input_file)
  397
  398        if input_file:
  399
  400            input_format = self.get_input_format()
  401            input_compressed = self.get_input_compressed()
  402            config = self.get_config()
  403            header_list = default_header_list
  404            if input_format in [
  405                "vcf",
  406                "hdr",
  407                "tsv",
  408                "csv",
  409                "psv",
  410                "parquet",
  411                "db",
  412                "duckdb",
  413            ]:
  414                # header provided in param
  415                if config.get("header_file", None):
  416                    with open(config.get("header_file"), "rt") as f:
  417                        header_list = self.read_vcf_header(f)
  418                # within a vcf file format (header within input file itsself)
  419                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  420                    # within a compressed vcf file format (.vcf.gz)
  421                    if input_compressed:
  422                        with bgzf.open(input_file, "rt") as f:
  423                            header_list = self.read_vcf_header(f)
  424                    # within an uncompressed vcf file format (.vcf)
  425                    else:
  426                        with open(input_file, "rt") as f:
  427                            header_list = self.read_vcf_header(f)
  428                # header provided in default external file .hdr
  429                elif os.path.exists((input_file + ".hdr")):
  430                    with open(input_file + ".hdr", "rt") as f:
  431                        header_list = self.read_vcf_header(f)
  432                else:
  433                    try:  # Try to get header info fields and file columns
  434
  435                        with tempfile.TemporaryDirectory() as tmpdir:
  436
  437                            # Create database
  438                            db_for_header = Database(database=input_file)
  439
  440                            # Get header columns for infos fields
  441                            db_header_from_columns = (
  442                                db_for_header.get_header_from_columns()
  443                            )
  444
  445                            # Get real columns in the file
  446                            db_header_columns = db_for_header.get_columns()
  447
  448                            # Write header file
  449                            header_file_tmp = os.path.join(tmpdir, "header")
  450                            f = open(header_file_tmp, "w")
  451                            vcf.Writer(f, db_header_from_columns)
  452                            f.close()
  453
  454                            # Replace #CHROM line with rel columns
  455                            header_list = db_for_header.read_header_file(
  456                                header_file=header_file_tmp
  457                            )
  458                            header_list[-1] = "\t".join(db_header_columns)
  459
  460                    except:
  461
  462                        log.warning(
  463                            f"No header for file {input_file}. Set as default VCF header"
  464                        )
  465                        header_list = default_header_list
  466
  467            else:  # try for unknown format ?
  468
  469                log.error(f"Input file format '{input_format}' not available")
  470                raise ValueError(f"Input file format '{input_format}' not available")
  471
  472            if not header_list:
  473                header_list = default_header_list
  474
  475            # header as list
  476            self.header_list = header_list
  477
  478            # header as VCF object
  479            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  480
  481        else:
  482
  483            self.header_list = None
  484            self.header_vcf = None
  485
  486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  487        """
  488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  489        DataFrame based on the connection format.
  490
  491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  492        represents the SQL query you want to execute. This query will be used to fetch data from a
  493        database and convert it into a pandas DataFrame
  494        :type query: str
  495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  497        function will only fetch up to that number of rows from the database query result. If no limit
  498        is specified,
  499        :type limit: int
  500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  501        """
  502
  503        # Connexion format
  504        connexion_format = self.get_connexion_format()
  505
  506        # Limit in query
  507        if limit:
  508            pd.set_option("display.max_rows", limit)
  509            if connexion_format in ["duckdb"]:
  510                df = (
  511                    self.conn.execute(query)
  512                    .fetch_record_batch(limit)
  513                    .read_next_batch()
  514                    .to_pandas()
  515                )
  516            elif connexion_format in ["sqlite"]:
  517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  518
  519        # Full query
  520        else:
  521            if connexion_format in ["duckdb"]:
  522                df = self.conn.execute(query).df()
  523            elif connexion_format in ["sqlite"]:
  524                df = pd.read_sql_query(query, self.conn)
  525
  526        return df
  527
  528    def get_overview(self) -> None:
  529        """
  530        The function prints the input, output, config, and dataframe of the current object
  531        """
  532        table_variants_from = self.get_table_variants(clause="from")
  533        sql_columns = self.get_header_columns_as_sql()
  534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  535        df = self.get_query_to_df(sql_query_export)
  536        log.info(
  537            "Input:  "
  538            + str(self.get_input())
  539            + " ["
  540            + str(str(self.get_input_format()))
  541            + "]"
  542        )
  543        log.info(
  544            "Output: "
  545            + str(self.get_output())
  546            + " ["
  547            + str(str(self.get_output_format()))
  548            + "]"
  549        )
  550        log.info("Config: ")
  551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  552            "\n"
  553        ):
  554            log.info("\t" + str(d))
  555        log.info("Param: ")
  556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  557            "\n"
  558        ):
  559            log.info("\t" + str(d))
  560        log.info("Sample list: " + str(self.get_header_sample_list()))
  561        log.info("Dataframe: ")
  562        for d in str(df).split("\n"):
  563            log.info("\t" + str(d))
  564
  565        # garbage collector
  566        del df
  567        gc.collect()
  568
  569        return None
  570
    def get_stats(self) -> dict:
        """
        Compute statistics of the current variants table.

        The returned dictionary contains:

        - ``Infos``: input file, total number of variants, number of samples
          (when genotypes are present), number of INFO/FORMAT fields;
        - ``Variants``: counts per chromosome, SNV/MNV/InDel counts and SNV
          substitution counts;
        - ``Samples``: per-sample genotype counts (only when a ``GT`` FORMAT
          field and a ``FORMAT`` column are present);
        - ``Header``: INFO and FORMAT field descriptions;
        - ``Quality``: QUAL summary stats (only when a ``QUAL`` column is
          present).

        NOTE(review): the sample/QUAL/SNV queries use DuckDB SQL functions
        (REGEXP_EXTRACT, len, string_split) — presumably this path is only
        exercised with a duckdb connection; confirm before reuse with sqlite.

        :return: a dictionary of statistics as described above
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table (used in FROM clauses below)
        table_variants_from = self.get_table_variants()

        # Stats dictionary, filled in section by section
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header: INFO and FORMAT field declarations
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Number of variants per chromosome, sorted by chromosome name
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Per-chromosome fraction of the total (0..1)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Count genotypes per sample, only when genotype data is declared
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Extract the genotype prefix (e.g. "0/1") from each sample
                # column; only rows whose FORMAT and sample field counts match
                # are considered valid genotype entries
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts as present only if it has genotype rows
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running index shared across INFO and FORMAT tables
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: map PyVCF special codes back to VCF symbols
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when undeclared)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description (empty string when undeclared)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL summary (skipping missing "." values)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel counts, classified by REF/ALT lengths

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitutions (e.g. "A>G"), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  792
  793    def stats_to_file(self, file: str = None) -> str:
  794        """
  795        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  796        into a JSON object, and writes the JSON object to the specified file.
  797
  798        :param file: The `file` parameter is a string that represents the file path where the JSON data
  799        will be written
  800        :type file: str
  801        :return: the name of the file that was written to.
  802        """
  803
  804        # Get stats
  805        stats = self.get_stats()
  806
  807        # Serializing json
  808        json_object = json.dumps(stats, indent=4)
  809
  810        # Writing to sample.json
  811        with open(file, "w") as outfile:
  812            outfile.write(json_object)
  813
  814        return file
  815
  816    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  817        """
  818        The `print_stats` function generates a markdown file and prints the statistics contained in a
  819        JSON file in a formatted manner.
  820
  821        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  822        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  823        provided, a temporary directory will be created and the stats will be saved in a file named
  824        "stats.md" within that
  825        :type output_file: str
  826        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  827        file where the statistics will be saved. If no value is provided, a temporary directory will be
  828        created and a default file name "stats.json" will be used
  829        :type json_file: str
  830        :return: The function `print_stats` does not return any value. It has a return type annotation
  831        of `None`.
  832        """
  833
  834        # Full path
  835        output_file = full_path(output_file)
  836        json_file = full_path(json_file)
  837
  838        with tempfile.TemporaryDirectory() as tmpdir:
  839
  840            # Files
  841            if not output_file:
  842                output_file = os.path.join(tmpdir, "stats.md")
  843            if not json_file:
  844                json_file = os.path.join(tmpdir, "stats.json")
  845
  846            # Create folders
  847            if not os.path.exists(os.path.dirname(output_file)):
  848                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  849            if not os.path.exists(os.path.dirname(json_file)):
  850                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  851
  852            # Create stats JSON file
  853            stats_file = self.stats_to_file(file=json_file)
  854
  855            # Print stats file
  856            with open(stats_file) as f:
  857                stats = yaml.safe_load(f)
  858
  859            # Output
  860            output_title = []
  861            output_index = []
  862            output = []
  863
  864            # Title
  865            output_title.append("# HOWARD Stats")
  866
  867            # Index
  868            output_index.append("## Index")
  869
  870            # Process sections
  871            for section in stats:
  872                infos = stats.get(section)
  873                section_link = "#" + section.lower().replace(" ", "-")
  874                output.append(f"## {section}")
  875                output_index.append(f"- [{section}]({section_link})")
  876
  877                if len(infos):
  878                    for info in infos:
  879                        try:
  880                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  881                            is_df = True
  882                        except:
  883                            try:
  884                                df = pd.DataFrame.from_dict(
  885                                    json.loads((infos.get(info))), orient="index"
  886                                )
  887                                is_df = True
  888                            except:
  889                                is_df = False
  890                        if is_df:
  891                            output.append(f"### {info}")
  892                            info_link = "#" + info.lower().replace(" ", "-")
  893                            output_index.append(f"   - [{info}]({info_link})")
  894                            output.append(f"{df.to_markdown(index=False)}")
  895                        else:
  896                            output.append(f"- {info}: {infos.get(info)}")
  897                else:
  898                    output.append(f"NA")
  899
  900            # Write stats in markdown file
  901            with open(output_file, "w") as fp:
  902                for item in output_title:
  903                    fp.write("%s\n" % item)
  904                for item in output_index:
  905                    fp.write("%s\n" % item)
  906                for item in output:
  907                    fp.write("%s\n" % item)
  908
  909            # Output stats in markdown
  910            print("")
  911            print("\n\n".join(output_title))
  912            print("")
  913            print("\n\n".join(output))
  914            print("")
  915
  916        return None
  917
  918    def get_input(self) -> str:
  919        """
  920        It returns the value of the input variable.
  921        :return: The input is being returned.
  922        """
  923        return self.input
  924
  925    def get_input_format(self, input_file: str = None) -> str:
  926        """
  927        This function returns the format of the input variable, either from the provided input file or
  928        by prompting for input.
  929
  930        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  931        represents the file path of the input file. If no `input_file` is provided when calling the
  932        method, it will default to `None`
  933        :type input_file: str
  934        :return: The format of the input variable is being returned.
  935        """
  936
  937        if not input_file:
  938            input_file = self.get_input()
  939        input_format = get_file_format(input_file)
  940        return input_format
  941
  942    def get_input_compressed(self, input_file: str = None) -> str:
  943        """
  944        The function `get_input_compressed` returns the format of the input variable after compressing
  945        it.
  946
  947        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  948        that represents the file path of the input file. If no `input_file` is provided when calling the
  949        method, it will default to `None` and the method will then call `self.get_input()` to
  950        :type input_file: str
  951        :return: The function `get_input_compressed` returns the compressed format of the input
  952        variable.
  953        """
  954
  955        if not input_file:
  956            input_file = self.get_input()
  957        input_compressed = get_file_compressed(input_file)
  958        return input_compressed
  959
  960    def get_output(self) -> str:
  961        """
  962        It returns the output of the neuron.
  963        :return: The output of the neural network.
  964        """
  965
  966        return self.output
  967
  968    def get_output_format(self, output_file: str = None) -> str:
  969        """
  970        The function `get_output_format` returns the format of the input variable or the output file if
  971        provided.
  972
  973        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  974        that represents the file path of the output file. If no `output_file` is provided when calling
  975        the method, it will default to the output obtained from the `get_output` method of the class
  976        instance. The
  977        :type output_file: str
  978        :return: The format of the input variable is being returned.
  979        """
  980
  981        if not output_file:
  982            output_file = self.get_output()
  983        output_format = get_file_format(output_file)
  984
  985        return output_format
  986
  987    def get_config(self) -> dict:
  988        """
  989        It returns the config
  990        :return: The config variable is being returned.
  991        """
  992        return self.config
  993
  994    def get_param(self) -> dict:
  995        """
  996        It returns the param
  997        :return: The param variable is being returned.
  998        """
  999        return self.param
 1000
 1001    def get_connexion_db(self) -> str:
 1002        """
 1003        It returns the connexion_db attribute of the object
 1004        :return: The connexion_db is being returned.
 1005        """
 1006        return self.connexion_db
 1007
 1008    def get_prefix(self) -> str:
 1009        """
 1010        It returns the prefix of the object.
 1011        :return: The prefix is being returned.
 1012        """
 1013        return self.prefix
 1014
 1015    def get_table_variants(self, clause: str = "select") -> str:
 1016        """
 1017        This function returns the table_variants attribute of the object
 1018
 1019        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1020        defaults to select (optional)
 1021        :return: The table_variants attribute of the object.
 1022        """
 1023
 1024        # Access
 1025        access = self.get_config().get("access", None)
 1026
 1027        # Clauses "select", "where", "update"
 1028        if clause in ["select", "where", "update"]:
 1029            table_variants = self.table_variants
 1030        # Clause "from"
 1031        elif clause in ["from"]:
 1032            # For Read Only
 1033            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1034                input_file = self.get_input()
 1035                table_variants = f"'{input_file}' as variants"
 1036            # For Read Write
 1037            else:
 1038                table_variants = f"{self.table_variants} as variants"
 1039        else:
 1040            table_variants = self.table_variants
 1041        return table_variants
 1042
 1043    def get_tmp_dir(self) -> str:
 1044        """
 1045        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1046        parameters or a default path.
 1047        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1048        configuration, parameters, and a default value of "/tmp".
 1049        """
 1050
 1051        return get_tmp(
 1052            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1053        )
 1054
 1055    def get_connexion_type(self) -> str:
 1056        """
 1057        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1058
 1059        :return: The connexion type is being returned.
 1060        """
 1061        return self.get_config().get("connexion_type", "memory")
 1062
 1063    def get_connexion(self):
 1064        """
 1065        It returns the connection object
 1066
 1067        :return: The connection object.
 1068        """
 1069        return self.conn
 1070
 1071    def close_connexion(self) -> None:
 1072        """
 1073        This function closes the connection to the database.
 1074        :return: The connection is being closed.
 1075        """
 1076        return self.conn.close()
 1077
 1078    def get_header(self, type: str = "vcf"):
 1079        """
 1080        This function returns the header of the VCF file as a list of strings
 1081
 1082        :param type: the type of header you want to get, defaults to vcf (optional)
 1083        :return: The header of the vcf file.
 1084        """
 1085
 1086        if self.header_vcf:
 1087            if type == "vcf":
 1088                return self.header_vcf
 1089            elif type == "list":
 1090                return self.header_list
 1091        else:
 1092            if type == "vcf":
 1093                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1094                return header
 1095            elif type == "list":
 1096                return vcf_required
 1097
 1098    def get_header_infos_list(self) -> list:
 1099        """
 1100        This function retrieves a list of information fields from the header.
 1101        :return: A list of information fields from the header.
 1102        """
 1103
 1104        # Init
 1105        infos_list = []
 1106
 1107        for field in self.get_header().infos:
 1108            infos_list.append(field)
 1109
 1110        return infos_list
 1111
 1112    def get_header_length(self, file: str = None) -> int:
 1113        """
 1114        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1115        line.
 1116
 1117        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1118        header file. If this argument is provided, the function will read the header from the specified
 1119        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1120        :type file: str
 1121        :return: the length of the header list, excluding the #CHROM line.
 1122        """
 1123
 1124        if file:
 1125            return len(self.read_vcf_header_file(file=file)) - 1
 1126        elif self.get_header(type="list"):
 1127            return len(self.get_header(type="list")) - 1
 1128        else:
 1129            return 0
 1130
 1131    def get_header_columns(self) -> str:
 1132        """
 1133        This function returns the header list of a VCF
 1134
 1135        :return: The length of the header list.
 1136        """
 1137        if self.get_header():
 1138            return self.get_header(type="list")[-1]
 1139        else:
 1140            return ""
 1141
 1142    def get_header_columns_as_list(self) -> list:
 1143        """
 1144        This function returns the header list of a VCF
 1145
 1146        :return: The length of the header list.
 1147        """
 1148        if self.get_header():
 1149            return self.get_header_columns().strip().split("\t")
 1150        else:
 1151            return []
 1152
 1153    def get_header_columns_as_sql(self) -> str:
 1154        """
 1155        This function retruns header length (without #CHROM line)
 1156
 1157        :return: The length of the header list.
 1158        """
 1159        sql_column_list = []
 1160        for col in self.get_header_columns_as_list():
 1161            sql_column_list.append(f'"{col}"')
 1162        return ",".join(sql_column_list)
 1163
 1164    def get_header_sample_list(
 1165        self, check: bool = False, samples: list = None, samples_force: bool = False
 1166    ) -> list:
 1167        """
 1168        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1169        checking and filtering based on input parameters.
 1170
 1171        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1172        parameter that determines whether to check if the samples in the list are properly defined as
 1173        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1174        list is defined as a, defaults to False
 1175        :type check: bool (optional)
 1176        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1177        allows you to specify a subset of samples from the header. If you provide a list of sample
 1178        names, the function will check if each sample is defined in the header. If a sample is not found
 1179        in the
 1180        :type samples: list
 1181        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1182        a boolean parameter that determines whether to force the function to return the sample list
 1183        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1184        function will return the sample list without performing, defaults to False
 1185        :type samples_force: bool (optional)
 1186        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1187        parameters and conditions specified in the function.
 1188        """
 1189
 1190        # Init
 1191        samples_list = []
 1192
 1193        if samples is None:
 1194            samples_list = self.header_vcf.samples
 1195        else:
 1196            samples_checked = []
 1197            for sample in samples:
 1198                if sample in self.header_vcf.samples:
 1199                    samples_checked.append(sample)
 1200                else:
 1201                    log.warning(f"Sample '{sample}' not defined in header")
 1202            samples_list = samples_checked
 1203
 1204            # Force sample list without checking if is_genotype_column
 1205            if samples_force:
 1206                log.warning(f"Samples {samples_list} not checked if genotypes")
 1207                return samples_list
 1208
 1209        if check:
 1210            samples_checked = []
 1211            for sample in samples_list:
 1212                if self.is_genotype_column(column=sample):
 1213                    samples_checked.append(sample)
 1214                else:
 1215                    log.warning(
 1216                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1217                    )
 1218            samples_list = samples_checked
 1219
 1220        # Return samples list
 1221        return samples_list
 1222
 1223    def is_genotype_column(self, column: str = None) -> bool:
 1224        """
 1225        This function checks if a given column is a genotype column in a database.
 1226
 1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1228        represents the column name in a database table. This method checks if the specified column is a
 1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1230        method of
 1231        :type column: str
 1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1234        column name and returns the result. If the `column` parameter is None, it returns False.
 1235        """
 1236
 1237        if column is not None:
 1238            return Database(database=self.get_input()).is_genotype_column(column=column)
 1239        else:
 1240            return False
 1241
 1242    def get_verbose(self) -> bool:
 1243        """
 1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1245        exist
 1246
 1247        :return: The value of the key "verbose" in the config dictionary.
 1248        """
 1249        return self.get_config().get("verbose", False)
 1250
 1251    def get_connexion_format(self) -> str:
 1252        """
 1253        It returns the connexion format of the object.
 1254        :return: The connexion_format is being returned.
 1255        """
 1256        connexion_format = self.connexion_format
 1257        if connexion_format not in ["duckdb", "sqlite"]:
 1258            log.error(f"Unknown connexion format {connexion_format}")
 1259            raise ValueError(f"Unknown connexion format {connexion_format}")
 1260        else:
 1261            return connexion_format
 1262
 1263    def insert_file_to_table(
 1264        self,
 1265        file,
 1266        columns: str,
 1267        header_len: int = 0,
 1268        sep: str = "\t",
 1269        chunksize: int = 1000000,
 1270    ) -> None:
 1271        """
 1272        The function reads a file in chunks and inserts each chunk into a table based on the specified
 1273        database format.
 1274
 1275        :param file: The `file` parameter is the file that you want to load into a table. It should be
 1276        the path to the file on your system
 1277        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
 1278        should contain the names of the columns in the table where the data will be inserted. The column
 1279        names should be separated by commas within the string. For example, if you have columns named
 1280        "id", "name
 1281        :type columns: str
 1282        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
 1283        the number of lines to skip at the beginning of the file before reading the actual data. This
 1284        parameter allows you to skip any header information present in the file before processing the
 1285        data, defaults to 0
 1286        :type header_len: int (optional)
 1287        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
 1288        separator character that is used in the file being read. In this case, the default separator is
 1289        set to `\t`, which represents a tab character. You can change this parameter to a different
 1290        separator character if, defaults to \t
 1291        :type sep: str (optional)
 1292        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
 1293        when processing the file in chunks. In the provided code snippet, the default value for
 1294        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
 1295        to 1000000
 1296        :type chunksize: int (optional)
 1297        """
 1298
 1299        # Config
 1300        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
 1301        connexion_format = self.get_connexion_format()
 1302
 1303        log.debug("chunksize: " + str(chunksize))
 1304
 1305        if chunksize:
 1306            for chunk in pd.read_csv(
 1307                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
 1308            ):
 1309                if connexion_format in ["duckdb"]:
 1310                    sql_insert_into = (
 1311                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
 1312                    )
 1313                    self.conn.execute(sql_insert_into)
 1314                elif connexion_format in ["sqlite"]:
 1315                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1316
 1317    def load_data(
 1318        self,
 1319        input_file: str = None,
 1320        drop_variants_table: bool = False,
 1321        sample_size: int = 20480,
 1322    ) -> None:
 1323        """
 1324        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1325        table before loading the data and specify a sample size.
 1326
 1327        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1328        table
 1329        :type input_file: str
 1330        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1331        determines whether the variants table should be dropped before loading the data. If set to
 1332        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1333        not be dropped, defaults to False
 1334        :type drop_variants_table: bool (optional)
 1335        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1336        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1337        20480
 1338        :type sample_size: int (optional)
 1339        """
 1340
 1341        log.info("Loading...")
 1342
 1343        # change input file
 1344        if input_file:
 1345            self.set_input(input_file)
 1346            self.set_header()
 1347
 1348        # drop variants table
 1349        if drop_variants_table:
 1350            self.drop_variants_table()
 1351
 1352        # get table variants
 1353        table_variants = self.get_table_variants()
 1354
 1355        # Access
 1356        access = self.get_config().get("access", None)
 1357        log.debug(f"access: {access}")
 1358
 1359        # Input format and compress
 1360        input_format = self.get_input_format()
 1361        input_compressed = self.get_input_compressed()
 1362        log.debug(f"input_format: {input_format}")
 1363        log.debug(f"input_compressed: {input_compressed}")
 1364
 1365        # input_compressed_format
 1366        if input_compressed:
 1367            input_compressed_format = "gzip"
 1368        else:
 1369            input_compressed_format = "none"
 1370        log.debug(f"input_compressed_format: {input_compressed_format}")
 1371
 1372        # Connexion format
 1373        connexion_format = self.get_connexion_format()
 1374
 1375        # Sample size
 1376        if not sample_size:
 1377            sample_size = -1
 1378        log.debug(f"sample_size: {sample_size}")
 1379
 1380        # Load data
 1381        log.debug(f"Load Data from {input_format}")
 1382
 1383        # DuckDB connexion
 1384        if connexion_format in ["duckdb"]:
 1385
 1386            # Database already exists
 1387            if self.input_format in ["db", "duckdb"]:
 1388
 1389                if connexion_format in ["duckdb"]:
 1390                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1391                else:
 1392                    log.error(
 1393                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1394                    )
 1395                    raise ValueError(
 1396                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1397                    )
 1398
 1399            # Load from existing database format
 1400            else:
 1401
 1402                try:
 1403                    # Create Table or View
 1404                    database = Database(database=self.input)
 1405                    sql_from = database.get_sql_from(sample_size=sample_size)
 1406
 1407                    if access in ["RO"]:
 1408                        sql_load = (
 1409                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1410                        )
 1411                    else:
 1412                        sql_load = (
 1413                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1414                        )
 1415                    self.conn.execute(sql_load)
 1416
 1417                except:
 1418                    # Format not available
 1419                    log.error(f"Input file format '{self.input_format}' not available")
 1420                    raise ValueError(
 1421                        f"Input file format '{self.input_format}' not available"
 1422                    )
 1423
 1424        # SQLite connexion
 1425        elif connexion_format in ["sqlite"] and input_format in [
 1426            "vcf",
 1427            "tsv",
 1428            "csv",
 1429            "psv",
 1430        ]:
 1431
 1432            # Main structure
 1433            structure = {
 1434                "#CHROM": "VARCHAR",
 1435                "POS": "INTEGER",
 1436                "ID": "VARCHAR",
 1437                "REF": "VARCHAR",
 1438                "ALT": "VARCHAR",
 1439                "QUAL": "VARCHAR",
 1440                "FILTER": "VARCHAR",
 1441                "INFO": "VARCHAR",
 1442            }
 1443
 1444            # Strcuture with samples
 1445            structure_complete = structure
 1446            if self.get_header_sample_list():
 1447                structure["FORMAT"] = "VARCHAR"
 1448                for sample in self.get_header_sample_list():
 1449                    structure_complete[sample] = "VARCHAR"
 1450
 1451            # Columns list for create and insert
 1452            sql_create_table_columns = []
 1453            sql_create_table_columns_list = []
 1454            for column in structure_complete:
 1455                column_type = structure_complete[column]
 1456                sql_create_table_columns.append(
 1457                    f'"{column}" {column_type} default NULL'
 1458                )
 1459                sql_create_table_columns_list.append(f'"{column}"')
 1460
 1461            # Create database
 1462            log.debug(f"Create Table {table_variants}")
 1463            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1464            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1465            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1466            self.conn.execute(sql_create_table)
 1467
 1468            # chunksize define length of file chunk load file
 1469            chunksize = 100000
 1470
 1471            # delimiter
 1472            delimiter = file_format_delimiters.get(input_format, "\t")
 1473
 1474            # Load the input file
 1475            with open(self.input, "rt") as input_file:
 1476
 1477                # Use the appropriate file handler based on the input format
 1478                if input_compressed:
 1479                    input_file = bgzf.open(self.input, "rt")
 1480                if input_format in ["vcf"]:
 1481                    header_len = self.get_header_length()
 1482                else:
 1483                    header_len = 0
 1484
 1485                # Insert the file contents into a table
 1486                self.insert_file_to_table(
 1487                    input_file,
 1488                    columns=sql_create_table_columns_list_sql,
 1489                    header_len=header_len,
 1490                    sep=delimiter,
 1491                    chunksize=chunksize,
 1492                )
 1493
 1494        else:
 1495            log.error(
 1496                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1497            )
 1498            raise ValueError(
 1499                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1500            )
 1501
 1502        # Explode INFOS fields into table fields
 1503        if self.get_explode_infos():
 1504            self.explode_infos(
 1505                prefix=self.get_explode_infos_prefix(),
 1506                fields=self.get_explode_infos_fields(),
 1507                force=True,
 1508            )
 1509
 1510        # Create index after insertion
 1511        self.create_indexes()
 1512
 1513    def get_explode_infos(self) -> bool:
 1514        """
 1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1516        to False if it is not set.
 1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1518        value. If the parameter is not present, it will return False.
 1519        """
 1520
 1521        return self.get_param().get("explode", {}).get("explode_infos", False)
 1522
 1523    def get_explode_infos_fields(
 1524        self,
 1525        explode_infos_fields: str = None,
 1526        remove_fields_not_in_header: bool = False,
 1527    ) -> list:
 1528        """
 1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1530        the input parameter `explode_infos_fields`.
 1531
 1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1534        comma-separated list of field names to explode
 1535        :type explode_infos_fields: str
 1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1537        flag that determines whether to remove fields that are not present in the header. If it is set
 1538        to `True`, any field that is not in the header will be excluded from the list of exploded
 1539        information fields. If it is set to `, defaults to False
 1540        :type remove_fields_not_in_header: bool (optional)
 1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1545        splitting the string by commas.
 1546        """
 1547
 1548        # If no fields, get it in param
 1549        if not explode_infos_fields:
 1550            explode_infos_fields = (
 1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1552            )
 1553
 1554        # If no fields, defined as all fields in header using keyword
 1555        if not explode_infos_fields:
 1556            explode_infos_fields = "*"
 1557
 1558        # If fields list not empty
 1559        if explode_infos_fields:
 1560
 1561            # Input fields list
 1562            if isinstance(explode_infos_fields, str):
 1563                fields_input = explode_infos_fields.split(",")
 1564            elif isinstance(explode_infos_fields, list):
 1565                fields_input = explode_infos_fields
 1566            else:
 1567                fields_input = []
 1568
 1569            # Fields list without * keyword
 1570            fields_without_all = fields_input.copy()
 1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1572                fields_without_all.remove("*")
 1573
 1574            # Fields in header
 1575            fields_in_header = sorted(list(set(self.get_header().infos)))
 1576
 1577            # Construct list of fields
 1578            fields_output = []
 1579            for field in fields_input:
 1580
 1581                # Strip field
 1582                field = field.strip()
 1583
 1584                # format keyword * in regex
 1585                if field.upper() in ["*"]:
 1586                    field = ".*"
 1587
 1588                # Find all fields with pattern
 1589                r = re.compile(rf"^{field}$")
 1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1591
 1592                # Remove fields input from search
 1593                if field in fields_search:
 1594                    fields_search = [field]
 1595                elif fields_search != [field]:
 1596                    fields_search = sorted(
 1597                        list(set(fields_search).difference(fields_input))
 1598                    )
 1599
 1600                # If field is not in header (avoid not well formatted header)
 1601                if not fields_search and not remove_fields_not_in_header:
 1602                    fields_search = [field]
 1603
 1604                # Add found fields
 1605                for new_field in fields_search:
 1606                    # Add field, if not already exists, and if it is in header (if asked)
 1607                    if (
 1608                        new_field not in fields_output
 1609                        and (
 1610                            not remove_fields_not_in_header
 1611                            or new_field in fields_in_header
 1612                        )
 1613                        and new_field not in [".*"]
 1614                    ):
 1615                        fields_output.append(new_field)
 1616
 1617            return fields_output
 1618
 1619        else:
 1620
 1621            return []
 1622
 1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1624        """
 1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1627        not provided.
 1628
 1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1630        prefix to be used for exploding or expanding information
 1631        :type explode_infos_prefix: str
 1632        :return: the value of the variable `explode_infos_prefix`.
 1633        """
 1634
 1635        if not explode_infos_prefix:
 1636            explode_infos_prefix = (
 1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1638            )
 1639
 1640        return explode_infos_prefix
 1641
 1642    def add_column(
 1643        self,
 1644        table_name,
 1645        column_name,
 1646        column_type,
 1647        default_value=None,
 1648        drop: bool = False,
 1649    ) -> dict:
 1650        """
 1651        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1652        doesn't already exist.
 1653
 1654        :param table_name: The name of the table to which you want to add a column
 1655        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1656        to the table
 1657        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1658        want to add to the table. It should be a string that represents the desired data type, such as
 1659        "INTEGER", "TEXT", "REAL", etc
 1660        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1661        default value for the newly added column. If a default value is provided, it will be assigned to
 1662        the column for any existing rows that do not have a value for that column
 1663        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1664        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1665        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1666        to False
 1667        :type drop: bool (optional)
 1668        :return: a boolean value indicating whether the column was successfully added to the table.
 1669        """
 1670
 1671        # added
 1672        added = False
 1673        dropped = False
 1674
 1675        # Check if the column already exists in the table
 1676        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1677        columns = self.get_query_to_df(query).columns.tolist()
 1678        if column_name.upper() in [c.upper() for c in columns]:
 1679            log.debug(
 1680                f"The {column_name} column already exists in the {table_name} table"
 1681            )
 1682            if drop:
 1683                self.drop_column(table_name=table_name, column_name=column_name)
 1684                dropped = True
 1685            else:
 1686                return None
 1687        else:
 1688            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1689
 1690        # Add column in table
 1691        add_column_query = (
 1692            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1693        )
 1694        if default_value is not None:
 1695            add_column_query += f" DEFAULT {default_value}"
 1696        self.execute_query(add_column_query)
 1697        added = not dropped
 1698        log.debug(
 1699            f"The {column_name} column was successfully added to the {table_name} table"
 1700        )
 1701
 1702        if added:
 1703            added_column = {
 1704                "table_name": table_name,
 1705                "column_name": column_name,
 1706                "column_type": column_type,
 1707                "default_value": default_value,
 1708            }
 1709        else:
 1710            added_column = None
 1711
 1712        return added_column
 1713
 1714    def drop_column(
 1715        self, column: dict = None, table_name: str = None, column_name: str = None
 1716    ) -> bool:
 1717        """
 1718        The `drop_column` function drops a specified column from a given table in a database and returns
 1719        True if the column was successfully dropped, and False if the column does not exist in the
 1720        table.
 1721
 1722        :param column: The `column` parameter is a dictionary that contains information about the column
 1723        you want to drop. It has two keys:
 1724        :type column: dict
 1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1726        drop a column
 1727        :type table_name: str
 1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1729        from the table
 1730        :type column_name: str
 1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1732        and False if the column does not exist in the table.
 1733        """
 1734
 1735        # Find column infos
 1736        if column:
 1737            if isinstance(column, dict):
 1738                table_name = column.get("table_name", None)
 1739                column_name = column.get("column_name", None)
 1740            elif isinstance(column, str):
 1741                table_name = self.get_table_variants()
 1742                column_name = column
 1743            else:
 1744                table_name = None
 1745                column_name = None
 1746
 1747        if not table_name and not column_name:
 1748            return False
 1749
 1750        # Removed
 1751        removed = False
 1752
 1753        # Check if the column already exists in the table
 1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1755        columns = self.get_query_to_df(query).columns.tolist()
 1756        if column_name in columns:
 1757            log.debug(f"The {column_name} column exists in the {table_name} table")
 1758        else:
 1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1760            return False
 1761
 1762        # Add column in table # ALTER TABLE integers DROP k
 1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1764        self.execute_query(add_column_query)
 1765        removed = True
 1766        log.debug(
 1767            f"The {column_name} column was successfully dropped to the {table_name} table"
 1768        )
 1769
 1770        return removed
 1771
 1772    def explode_infos(
 1773        self,
 1774        prefix: str = None,
 1775        create_index: bool = False,
 1776        fields: list = None,
 1777        force: bool = False,
 1778        proccess_all_fields_together: bool = False,
 1779        table: str = None,
 1780    ) -> list:
 1781        """
 1782        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1783        individual columns, returning a list of added columns.
 1784
 1785        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1786        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1787        `self.get_explode_infos_prefix()` as the prefix
 1788        :type prefix: str
 1789        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1790        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1791        `False`, indexes will not be created. The default value is `False`, defaults to False
 1792        :type create_index: bool (optional)
 1793        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1794        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1795        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1796        a list to the `
 1797        :type fields: list
 1798        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1799        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1800        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1801        defaults to False
 1802        :type force: bool (optional)
 1803        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1804        flag that determines whether to process all the INFO fields together or individually. If set to
 1805        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1806        be processed individually. The default value is, defaults to False
 1807        :type proccess_all_fields_together: bool (optional)
 1808        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1809        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1810        a value for the `table` parameter, the function will use that table name. If the `table`
 1811        parameter is
 1812        :type table: str
 1813        :return: The `explode_infos` function returns a list of added columns.
 1814        """
 1815
 1816        # drop indexes
 1817        self.drop_indexes()
 1818
 1819        # connexion format
 1820        connexion_format = self.get_connexion_format()
 1821
 1822        # Access
 1823        access = self.get_config().get("access", None)
 1824
 1825        # Added columns
 1826        added_columns = []
 1827
 1828        if access not in ["RO"]:
 1829
 1830            # prefix
 1831            if prefix in [None, True] or not isinstance(prefix, str):
 1832                if self.get_explode_infos_prefix() not in [None, True]:
 1833                    prefix = self.get_explode_infos_prefix()
 1834                else:
 1835                    prefix = "INFO/"
 1836
 1837            # table variants
 1838            if table is not None:
 1839                table_variants = table
 1840            else:
 1841                table_variants = self.get_table_variants(clause="select")
 1842
 1843            # extra infos
 1844            try:
 1845                extra_infos = self.get_extra_infos()
 1846            except:
 1847                extra_infos = []
 1848
 1849            # Header infos
 1850            header_infos = self.get_header().infos
 1851
 1852            log.debug(
 1853                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1854            )
 1855
 1856            sql_info_alter_table_array = []
 1857
 1858            # Info fields to check
 1859            fields_list = list(header_infos)
 1860            if fields:
 1861                fields_list += fields
 1862            fields_list = set(fields_list)
 1863
 1864            # If no fields
 1865            if not fields:
 1866                fields = []
 1867
 1868            # Translate fields if patterns
 1869            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1870
 1871            for info in fields:
 1872
 1873                info_id_sql = prefix + info
 1874
 1875                if (
 1876                    info in fields_list
 1877                    or prefix + info in fields_list
 1878                    or info in extra_infos
 1879                ):
 1880
 1881                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1882
 1883                    if info in header_infos:
 1884                        info_type = header_infos[info].type
 1885                        info_num = header_infos[info].num
 1886                    else:
 1887                        info_type = "String"
 1888                        info_num = 0
 1889
 1890                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1891                    if info_num != 1:
 1892                        type_sql = "VARCHAR"
 1893
 1894                    # Add field
 1895                    added_column = self.add_column(
 1896                        table_name=table_variants,
 1897                        column_name=info_id_sql,
 1898                        column_type=type_sql,
 1899                        default_value="null",
 1900                        drop=force,
 1901                    )
 1902
 1903                    if added_column:
 1904                        added_columns.append(added_column)
 1905
 1906                    if added_column or force:
 1907
 1908                        # add field to index
 1909                        self.index_additionnal_fields.append(info_id_sql)
 1910
 1911                        # Update field array
 1912                        if connexion_format in ["duckdb"]:
 1913                            update_info_field = f"""
 1914                            "{info_id_sql}" =
 1915                                CASE
 1916                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1917                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1918                                END
 1919                            """
 1920                        elif connexion_format in ["sqlite"]:
 1921                            update_info_field = f"""
 1922                                "{info_id_sql}" =
 1923                                    CASE
 1924                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1925                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1926                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1927                                    END
 1928                            """
 1929
 1930                        sql_info_alter_table_array.append(update_info_field)
 1931
 1932            if sql_info_alter_table_array:
 1933
 1934                # By chromosomes
 1935                try:
 1936                    chromosomes_list = list(
 1937                        self.get_query_to_df(
 1938                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1939                        )["#CHROM"]
 1940                    )
 1941                except:
 1942                    chromosomes_list = [None]
 1943
 1944                for chrom in chromosomes_list:
 1945                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1946
 1947                    # Where clause
 1948                    where_clause = ""
 1949                    if chrom and len(chromosomes_list) > 1:
 1950                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1951
 1952                    # Update table
 1953                    if proccess_all_fields_together:
 1954                        sql_info_alter_table_array_join = ", ".join(
 1955                            sql_info_alter_table_array
 1956                        )
 1957                        if sql_info_alter_table_array_join:
 1958                            sql_info_alter_table = f"""
 1959                                UPDATE {table_variants}
 1960                                SET {sql_info_alter_table_array_join}
 1961                                {where_clause}
 1962                                """
 1963                            log.debug(
 1964                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1965                            )
 1966                            # log.debug(sql_info_alter_table)
 1967                            self.conn.execute(sql_info_alter_table)
 1968                    else:
 1969                        sql_info_alter_num = 0
 1970                        for sql_info_alter in sql_info_alter_table_array:
 1971                            sql_info_alter_num += 1
 1972                            sql_info_alter_table = f"""
 1973                                UPDATE {table_variants}
 1974                                SET {sql_info_alter}
 1975                                {where_clause}
 1976                                """
 1977                            log.debug(
 1978                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1979                            )
 1980                            # log.debug(sql_info_alter_table)
 1981                            self.conn.execute(sql_info_alter_table)
 1982
 1983        # create indexes
 1984        if create_index:
 1985            self.create_indexes()
 1986
 1987        return added_columns
 1988
 1989    def create_indexes(self) -> None:
 1990        """
 1991        Create indexes on the table after insertion
 1992        """
 1993
 1994        # Access
 1995        access = self.get_config().get("access", None)
 1996
 1997        # get table variants
 1998        table_variants = self.get_table_variants("FROM")
 1999
 2000        if self.get_indexing() and access not in ["RO"]:
 2001            # Create index
 2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2003            self.conn.execute(sql_create_table_index)
 2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2005            self.conn.execute(sql_create_table_index)
 2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2007            self.conn.execute(sql_create_table_index)
 2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2009            self.conn.execute(sql_create_table_index)
 2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2011            self.conn.execute(sql_create_table_index)
 2012            for field in self.index_additionnal_fields:
 2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2014                self.conn.execute(sql_create_table_index)
 2015
 2016    def drop_indexes(self) -> None:
 2017        """
 2018        Create indexes on the table after insertion
 2019        """
 2020
 2021        # Access
 2022        access = self.get_config().get("access", None)
 2023
 2024        # get table variants
 2025        table_variants = self.get_table_variants("FROM")
 2026
 2027        # Get database format
 2028        connexion_format = self.get_connexion_format()
 2029
 2030        if access not in ["RO"]:
 2031            if connexion_format in ["duckdb"]:
 2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2033            elif connexion_format in ["sqlite"]:
 2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2035
 2036            list_indexes = self.conn.execute(sql_list_indexes)
 2037            index_names = [row[0] for row in list_indexes.fetchall()]
 2038            for index in index_names:
 2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2040                self.conn.execute(sql_drop_table_index)
 2041
 2042    def read_vcf_header(self, f) -> list:
 2043        """
 2044        It reads the header of a VCF file and returns a list of the header lines
 2045
 2046        :param f: the file object
 2047        :return: The header lines of the VCF file.
 2048        """
 2049
 2050        header_list = []
 2051        for line in f:
 2052            header_list.append(line)
 2053            if line.startswith("#CHROM"):
 2054                break
 2055        return header_list
 2056
 2057    def read_vcf_header_file(self, file: str = None) -> list:
 2058        """
 2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2060        uncompressed files.
 2061
 2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2064        default to `None`
 2065        :type file: str
 2066        :return: The function `read_vcf_header_file` returns a list.
 2067        """
 2068
 2069        if self.get_input_compressed(input_file=file):
 2070            with bgzf.open(file, "rt") as f:
 2071                return self.read_vcf_header(f=f)
 2072        else:
 2073            with open(file, "rt") as f:
 2074                return self.read_vcf_header(f=f)
 2075
 2076    def execute_query(self, query: str):
 2077        """
 2078        It takes a query as an argument, executes it, and returns the results
 2079
 2080        :param query: The query to be executed
 2081        :return: The result of the query is being returned.
 2082        """
 2083        if query:
 2084            return self.conn.execute(query)  # .fetchall()
 2085        else:
 2086            return None
 2087
 2088    def export_output(
 2089        self,
 2090        output_file: str | None = None,
 2091        output_header: str | None = None,
 2092        export_header: bool = True,
 2093        query: str | None = None,
 2094        parquet_partitions: list | None = None,
 2095        chunk_size: int | None = None,
 2096        threads: int | None = None,
 2097        sort: bool = False,
 2098        index: bool = False,
 2099        order_by: str | None = None,
 2100        fields_to_rename: dict | None = None,
 2101    ) -> bool:
 2102        """
 2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2105        partitioning.
 2106
 2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2108        output file where the exported data will be saved
 2109        :type output_file: str | None
 2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2112        header will be exported to a file with the same name as the `output_file` parameter, but with
 2113        the extension "
 2114        :type output_header: str | None
 2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2117        True, the header will be exported to a file. If `export_header` is False, the header will not
 2118        be, defaults to True
 2119        :type export_header: bool (optional)
 2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2121        that can be used to filter and select specific data from the VCF file before exporting it. If
 2122        provided, only the data that matches the query will be exported. This allows you to customize
 2123        the exported data based on
 2124        :type query: str | None
 2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2127        organize data in a hierarchical directory structure based on the values of one or more columns.
 2128        This can improve query performance when working with large datasets
 2129        :type parquet_partitions: list | None
 2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2132        multiple files. It helps in optimizing the export process by breaking down the data into
 2133        manageable chunks for processing and storage
 2134        :type chunk_size: int | None
 2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2136        threads to be used during the export process. It determines the level of parallelism and can
 2137        improve the performance of the export operation. If this parameter is not provided, the function
 2138        will use the default number of threads
 2139        :type threads: int | None
 2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2141        determines whether the output file should be sorted based on genomic coordinates of the
 2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2143        `False`,, defaults to False
 2144        :type sort: bool (optional)
 2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2146        determines whether an index should be created on the output file. If `index` is set to `True`,
 2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2148        :type index: bool (optional)
 2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2152        output file should be
 2153        :type order_by: str | None
 2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2155        mapping of field names to be renamed during the export process. This parameter allows you to
 2156        customize the output field names before exporting the data. Each key-value pair in the
 2157        dictionary represents the original field name as the key and the new field name
 2158        :type fields_to_rename: dict | None
 2159        :return: The `export_output` function returns a boolean value. It checks if the output file
 2160        exists and returns True if it does, or None if it doesn't.
 2161        """
 2162
 2163        # Log
 2164        log.info("Exporting...")
 2165
 2166        # Full path
 2167        output_file = full_path(output_file)
 2168        output_header = full_path(output_header)
 2169
 2170        # Config
 2171        config = self.get_config()
 2172
 2173        # Param
 2174        param = self.get_param()
 2175
 2176        # Tmp files to remove
 2177        tmp_to_remove = []
 2178
 2179        # If no output, get it
 2180        if not output_file:
 2181            output_file = self.get_output()
 2182
 2183        # If not threads
 2184        if not threads:
 2185            threads = self.get_threads()
 2186
 2187        # Rename fields
 2188        if not fields_to_rename:
 2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2191
 2192        # Auto header name with extension
 2193        if export_header or output_header:
 2194            if not output_header:
 2195                output_header = f"{output_file}.hdr"
 2196            # Export header
 2197            self.export_header(output_file=output_file)
 2198
 2199        # Switch off export header if VCF output
 2200        output_file_type = get_file_format(output_file)
 2201        if output_file_type in ["vcf"]:
 2202            export_header = False
 2203            tmp_to_remove.append(output_header)
 2204
 2205        # Chunk size
 2206        if not chunk_size:
 2207            chunk_size = config.get("chunk_size", None)
 2208
 2209        # Parquet partition
 2210        if not parquet_partitions:
 2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2212        if parquet_partitions and isinstance(parquet_partitions, str):
 2213            parquet_partitions = parquet_partitions.split(",")
 2214
 2215        # Order by
 2216        if not order_by:
 2217            order_by = param.get("export", {}).get("order_by", "")
 2218
 2219        # Header in output
 2220        header_in_output = param.get("export", {}).get("include_header", False)
 2221
 2222        # Database
 2223        database_source = self.get_connexion()
 2224
 2225        # Connexion format
 2226        connexion_format = self.get_connexion_format()
 2227
 2228        # Explode infos
 2229        if self.get_explode_infos():
 2230            self.explode_infos(
 2231                prefix=self.get_explode_infos_prefix(),
 2232                fields=self.get_explode_infos_fields(),
 2233                force=False,
 2234            )
 2235
 2236        # if connexion_format in ["sqlite"] or query:
 2237        if connexion_format in ["sqlite"]:
 2238
 2239            # Export in Parquet
 2240            random_tmp = "".join(
 2241                random.choice(string.ascii_lowercase) for i in range(10)
 2242            )
 2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2244            tmp_to_remove.append(database_source)
 2245
 2246            # Table Variants
 2247            table_variants = self.get_table_variants()
 2248
 2249            # Create export query
 2250            sql_query_export_subquery = f"""
 2251                SELECT * FROM {table_variants}
 2252                """
 2253
 2254            # Write source file
 2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2256
 2257        # Create database
 2258        database = Database(
 2259            database=database_source,
 2260            table="variants",
 2261            header_file=output_header,
 2262            conn_config=self.get_connexion_config(),
 2263        )
 2264
 2265        # Existing colomns header
 2266        existing_columns_header = database.get_header_columns_from_database(query=query)
 2267
 2268        # Sample list
 2269        if output_file_type in ["vcf"]:
 2270            get_samples = self.get_samples()
 2271            get_samples_check = self.get_samples_check()
 2272            samples_force = get_samples is not None
 2273            sample_list = self.get_header_sample_list(
 2274                check=get_samples_check,
 2275                samples=get_samples,
 2276                samples_force=samples_force,
 2277            )
 2278        else:
 2279            sample_list = None
 2280
 2281        # Export file
 2282        database.export(
 2283            output_database=output_file,
 2284            output_header=output_header,
 2285            existing_columns_header=existing_columns_header,
 2286            parquet_partitions=parquet_partitions,
 2287            chunk_size=chunk_size,
 2288            threads=threads,
 2289            sort=sort,
 2290            index=index,
 2291            header_in_output=header_in_output,
 2292            order_by=order_by,
 2293            query=query,
 2294            export_header=export_header,
 2295            sample_list=sample_list,
 2296        )
 2297
 2298        # Remove
 2299        remove_if_exists(tmp_to_remove)
 2300
 2301        return (os.path.exists(output_file) or None) and (
 2302            os.path.exists(output_file) or None
 2303        )
 2304
 2305    def get_extra_infos(self, table: str = None) -> list:
 2306        """
 2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2308        in the header.
 2309
 2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2311        name of the table from which you want to retrieve the extra columns that are not present in the
 2312        header. If the `table` parameter is not provided when calling the function, it will default to
 2313        using the variants
 2314        :type table: str
 2315        :return: A list of columns that are in the specified table but not in the header of the table.
 2316        """
 2317
 2318        header_columns = []
 2319
 2320        if not table:
 2321            table = self.get_table_variants(clause="from")
 2322            header_columns = self.get_header_columns()
 2323
 2324        # Check all columns in the database
 2325        query = f""" SELECT * FROM {table} LIMIT 1 """
 2326        log.debug(f"query {query}")
 2327        table_columns = self.get_query_to_df(query).columns.tolist()
 2328        extra_columns = []
 2329
 2330        # Construct extra infos (not in header)
 2331        for column in table_columns:
 2332            if column not in header_columns:
 2333                extra_columns.append(column)
 2334
 2335        return extra_columns
 2336
 2337    def get_extra_infos_sql(self, table: str = None) -> str:
 2338        """
 2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2340        by double quotes
 2341
 2342        :param table: The name of the table to get the extra infos from. If None, the default table is
 2343        used
 2344        :type table: str
 2345        :return: A string of the extra infos
 2346        """
 2347
 2348        return ", ".join(
 2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2350        )
 2351
 2352    def export_header(
 2353        self,
 2354        header_name: str = None,
 2355        output_file: str = None,
 2356        output_file_ext: str = ".hdr",
 2357        clean_header: bool = True,
 2358        remove_chrom_line: bool = False,
 2359    ) -> str:
 2360        """
 2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2362        specified options, and writes it to a new file.
 2363
 2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2365        this parameter is not specified, the header will be written to the output file
 2366        :type header_name: str
 2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2368        specify the name of the output file where the header will be written. If this parameter is not
 2369        provided, the header will be written to a temporary file
 2370        :type output_file: str
 2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2373        if not specified by the user. This extension will be appended to the `output_file` name to
 2374        create the final, defaults to .hdr
 2375        :type output_file_ext: str (optional)
 2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2378        `True`, the function will clean the header by modifying certain lines based on a specific
 2379        pattern. If `clean_header`, defaults to True
 2380        :type clean_header: bool (optional)
 2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2382        boolean flag that determines whether the #CHROM line should be removed from the header before
 2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2384        defaults to False
 2385        :type remove_chrom_line: bool (optional)
 2386        :return: The function `export_header` returns the name of the temporary header file that is
 2387        created.
 2388        """
 2389
 2390        if not header_name and not output_file:
 2391            output_file = self.get_output()
 2392
 2393        if self.get_header():
 2394
 2395            # Get header object
 2396            header_obj = self.get_header()
 2397
 2398            # Create database
 2399            db_for_header = Database(database=self.get_input())
 2400
 2401            # Get real columns in the file
 2402            db_header_columns = db_for_header.get_columns()
 2403
 2404            with tempfile.TemporaryDirectory() as tmpdir:
 2405
 2406                # Write header file
 2407                header_file_tmp = os.path.join(tmpdir, "header")
 2408                f = open(header_file_tmp, "w")
 2409                vcf.Writer(f, header_obj)
 2410                f.close()
 2411
 2412                # Replace #CHROM line with rel columns
 2413                header_list = db_for_header.read_header_file(
 2414                    header_file=header_file_tmp
 2415                )
 2416                header_list[-1] = "\t".join(db_header_columns)
 2417
 2418                # Remove CHROM line
 2419                if remove_chrom_line:
 2420                    header_list.pop()
 2421
 2422                # Clean header
 2423                if clean_header:
 2424                    header_list_clean = []
 2425                    for head in header_list:
 2426                        # Clean head for malformed header
 2427                        head_clean = head
 2428                        head_clean = re.subn(
 2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2431                            head_clean,
 2432                            2,
 2433                        )[0]
 2434                        # Write header
 2435                        header_list_clean.append(head_clean)
 2436                    header_list = header_list_clean
 2437
 2438            tmp_header_name = output_file + output_file_ext
 2439
 2440            f = open(tmp_header_name, "w")
 2441            for line in header_list:
 2442                f.write(line)
 2443            f.close()
 2444
 2445        return tmp_header_name
 2446
 2447    def export_variant_vcf(
 2448        self,
 2449        vcf_file,
 2450        remove_info: bool = False,
 2451        add_samples: bool = True,
 2452        list_samples: list = [],
 2453        where_clause: str = "",
 2454        index: bool = False,
 2455        threads: int | None = None,
 2456    ) -> bool | None:
 2457        """
 2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2459        remove INFO field, add samples, and control compression and indexing.
 2460
 2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2462        written to. It is the output file that will contain the filtered VCF data based on the specified
 2463        parameters
 2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2467        in, defaults to False
 2468        :type remove_info: bool (optional)
 2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2471        If set to False, the samples will be removed. The default value is True, defaults to True
 2472        :type add_samples: bool (optional)
 2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2474        in the output VCF file. By default, all samples will be included. If you provide a list of
 2475        samples, only those samples will be included in the output file
 2476        :type list_samples: list
 2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2478        determines whether or not to create an index for the output VCF file. If `index` is set to
 2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2480        :type index: bool (optional)
 2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2483        will be used during the export process. More threads can potentially speed up the export process
 2484        by utilizing multiple cores of the processor. If
 2485        :type threads: int | None
 2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2487        method with various parameters including the output file, query, threads, sort flag, and index
 2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2489        specified parameters and configurations provided in the `export_variant_vcf` function.
 2490        """
 2491
 2492        # Config
 2493        config = self.get_config()
 2494
 2495        # Extract VCF
 2496        log.debug("Export VCF...")
 2497
 2498        # Table variants
 2499        table_variants = self.get_table_variants()
 2500
 2501        # Threads
 2502        if not threads:
 2503            threads = self.get_threads()
 2504
 2505        # Info fields
 2506        if remove_info:
 2507            if not isinstance(remove_info, str):
 2508                remove_info = "."
 2509            info_field = f"""'{remove_info}' as INFO"""
 2510        else:
 2511            info_field = "INFO"
 2512
 2513        # Samples fields
 2514        if add_samples:
 2515            if not list_samples:
 2516                list_samples = self.get_header_sample_list()
 2517            if list_samples:
 2518                samples_fields = " , FORMAT , " + " , ".join(
 2519                    [f""" "{sample}" """ for sample in list_samples]
 2520                )
 2521            else:
 2522                samples_fields = ""
 2523            log.debug(f"samples_fields: {samples_fields}")
 2524        else:
 2525            samples_fields = ""
 2526
 2527        # Where clause
 2528        if where_clause is None:
 2529            where_clause = ""
 2530
 2531        # Variants
 2532        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2533        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2534        log.debug(f"sql_query_select={sql_query_select}")
 2535
 2536        return self.export_output(
 2537            output_file=vcf_file,
 2538            output_header=None,
 2539            export_header=True,
 2540            query=sql_query_select,
 2541            parquet_partitions=None,
 2542            chunk_size=config.get("chunk_size", None),
 2543            threads=threads,
 2544            sort=True,
 2545            index=index,
 2546            order_by=None,
 2547        )
 2548
 2549    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2550        """
 2551        It takes a list of commands and runs them in parallel using the number of threads specified
 2552
 2553        :param commands: A list of commands to run
 2554        :param threads: The number of threads to use, defaults to 1 (optional)
 2555        """
 2556
 2557        run_parallel_commands(commands, threads)
 2558
 2559    def get_threads(self, default: int = 1) -> int:
 2560        """
 2561        This function returns the number of threads to use for a job, with a default value of 1 if not
 2562        specified.
 2563
 2564        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2565        default number of threads to use if no specific value is provided. If no value is provided for
 2566        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2567        used, defaults to 1
 2568        :type default: int (optional)
 2569        :return: the number of threads to use for the current job.
 2570        """
 2571
 2572        # Config
 2573        config = self.get_config()
 2574
 2575        # Param
 2576        param = self.get_param()
 2577
 2578        # Input threads
 2579        input_thread = param.get("threads", config.get("threads", None))
 2580
 2581        # Check threads
 2582        if not input_thread:
 2583            threads = default
 2584        elif int(input_thread) <= 0:
 2585            threads = os.cpu_count()
 2586        else:
 2587            threads = int(input_thread)
 2588        return threads
 2589
 2590    def get_memory(self, default: str = None) -> str:
 2591        """
 2592        This function retrieves the memory value from parameters or configuration with a default value
 2593        if not found.
 2594
 2595        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2596        default value is used as a fallback in case the `memory` parameter is not provided in the
 2597        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2598        the function
 2599        :type default: str
 2600        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2601        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2602        return the default value provided as an argument to the function.
 2603        """
 2604
 2605        # Config
 2606        config = self.get_config()
 2607
 2608        # Param
 2609        param = self.get_param()
 2610
 2611        # Input threads
 2612        input_memory = param.get("memory", config.get("memory", None))
 2613
 2614        # Check threads
 2615        if input_memory:
 2616            memory = input_memory
 2617        else:
 2618            memory = default
 2619
 2620        return memory
 2621
 2622    def update_from_vcf(self, vcf_file: str) -> None:
 2623        """
 2624        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2625
 2626        :param vcf_file: the path to the VCF file
 2627        """
 2628
 2629        connexion_format = self.get_connexion_format()
 2630
 2631        if connexion_format in ["duckdb"]:
 2632            self.update_from_vcf_duckdb(vcf_file)
 2633        elif connexion_format in ["sqlite"]:
 2634            self.update_from_vcf_sqlite(vcf_file)
 2635
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO annotations of a VCF file to the variants table (DuckDB).

        The VCF body is loaded into a pandas DataFrame, and the UPDATE query
        joins against it by name (DuckDB resolves `vcf_df` in the SQL to the
        in-scope DataFrame via its replacement-scan mechanism). For each
        variant matching on #CHROM/POS/REF/ALT whose VCF INFO is non-empty,
        that INFO string is concatenated to the existing INFO value, with a
        ';' separator when both sides are non-empty.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a temporary DataFrame, skipping the '##'
        # meta-header lines so the '#CHROM' line becomes the column header
        skip = self.get_header_length(file=vcf_file)
        # NOTE: vcf_df looks unused but is referenced by name in the SQL
        # below through DuckDB's DataFrame replacement scan.
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Build INFO as: (existing INFO or '') + (';' if both sides set) + new INFO.
        # The correlated scalar subquery yields NULL when no VCF row matches;
        # DuckDB's concat() treats NULL as '', so unmatched variants keep
        # their existing INFO ('.'/'' normalize to '').
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2691
 2692    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2693        """
 2694        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2695        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2696        table
 2697
 2698        :param vcf_file: The path to the VCF file you want to update the database with
 2699        """
 2700
 2701        # Create a temporary table for the VCF
 2702        table_vcf = "tmp_vcf"
 2703        sql_create = (
 2704            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2705        )
 2706        self.conn.execute(sql_create)
 2707
 2708        # Loading VCF into temporaire table
 2709        vcf_df = pd.read_csv(
 2710            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2711        )
 2712        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2713        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2714
 2715        # Update table 'variants' with VCF data
 2716        # warning: CONCAT as || operator
 2717        sql_query_update = f"""
 2718            UPDATE variants as table_variants
 2719            SET INFO = CASE
 2720                            WHEN INFO NOT IN ('', '.')
 2721                            THEN INFO
 2722                            ELSE ''
 2723                        END ||
 2724                        (
 2725                        SELECT 
 2726                            CASE 
 2727                                WHEN table_variants.INFO NOT IN ('','.') 
 2728                                    AND table_vcf.INFO NOT IN ('','.')  
 2729                                THEN ';' 
 2730                                ELSE '' 
 2731                            END || 
 2732                            CASE 
 2733                                WHEN table_vcf.INFO NOT IN ('','.') 
 2734                                THEN table_vcf.INFO 
 2735                                ELSE '' 
 2736                            END
 2737                        FROM {table_vcf} as table_vcf
 2738                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2739                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2740                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2741                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2742                        )
 2743        """
 2744        self.conn.execute(sql_query_update)
 2745
 2746        # Drop temporary table
 2747        sql_drop = f"DROP TABLE {table_vcf}"
 2748        self.conn.execute(sql_drop)
 2749
 2750    def drop_variants_table(self) -> None:
 2751        """
 2752        > This function drops the variants table
 2753        """
 2754
 2755        table_variants = self.get_table_variants()
 2756        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2757        self.conn.execute(sql_table_variants)
 2758
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash built from the assembly name and the `#CHROM`, `POS`, `REF` and
        `ALT` columns (plus an SVTYPE tag — see the review note below).

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it
        already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param overrides config, then falls back to the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix used by explode_infos for the derived column names
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when none is given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id", not
        # variant_id_column — a custom-named column is re-created on every
        # call; confirm whether this is intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' expands to a quoted SQL *string
            # literal*, so the hash includes the constant tag name rather than
            # the value of the exploded SVTYPE column — verify whether the
            # column value was intended here.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
 2817
 2818    def get_variant_id_column(
 2819        self, variant_id_column: str = "variant_id", force: bool = None
 2820    ) -> str:
 2821        """
 2822        This function returns the variant_id column name
 2823
 2824        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2825        defaults to variant_id
 2826        :type variant_id_column: str (optional)
 2827        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2828        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2829        if it is not already set, or if it is set
 2830        :type force: bool
 2831        :return: The variant_id column name.
 2832        """
 2833
 2834        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2835
 2836    ###
 2837    # Annotation
 2838    ###
 2839
 2840    def scan_databases(
 2841        self,
 2842        database_formats: list = ["parquet"],
 2843        database_releases: list = ["current"],
 2844    ) -> dict:
 2845        """
 2846        The function `scan_databases` scans for available databases based on specified formats and
 2847        releases.
 2848
 2849        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2850        of the databases to be scanned. In this case, the accepted format is "parquet"
 2851        :type database_formats: list ["parquet"]
 2852        :param database_releases: The `database_releases` parameter is a list that specifies the
 2853        releases of the databases to be scanned. In the provided function, the default value for
 2854        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2855        databases that are in the "current"
 2856        :type database_releases: list
 2857        :return: The function `scan_databases` returns a dictionary containing information about
 2858        databases that match the specified formats and releases.
 2859        """
 2860
 2861        # Config
 2862        config = self.get_config()
 2863
 2864        # Param
 2865        param = self.get_param()
 2866
 2867        # Param - Assembly
 2868        assembly = param.get("assembly", config.get("assembly", None))
 2869        if not assembly:
 2870            assembly = DEFAULT_ASSEMBLY
 2871            log.warning(f"Default assembly '{assembly}'")
 2872
 2873        # Scan for availabled databases
 2874        log.info(
 2875            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2876        )
 2877        databases_infos_dict = databases_infos(
 2878            database_folder_releases=database_releases,
 2879            database_formats=database_formats,
 2880            assembly=assembly,
 2881            config=config,
 2882        )
 2883        log.info(
 2884            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2885        )
 2886
 2887        return databases_infos_dict
 2888
 2889    def annotation(self) -> None:
 2890        """
 2891        It annotates the VCF file with the annotations specified in the config file.
 2892        """
 2893
 2894        # Config
 2895        config = self.get_config()
 2896
 2897        # Param
 2898        param = self.get_param()
 2899
 2900        # Param - Assembly
 2901        assembly = param.get("assembly", config.get("assembly", None))
 2902        if not assembly:
 2903            assembly = DEFAULT_ASSEMBLY
 2904            log.warning(f"Default assembly '{assembly}'")
 2905
 2906        # annotations databases folders
 2907        annotations_databases = set(
 2908            config.get("folders", {})
 2909            .get("databases", {})
 2910            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2911            + config.get("folders", {})
 2912            .get("databases", {})
 2913            .get("parquet", ["~/howard/databases/parquet/current"])
 2914            + config.get("folders", {})
 2915            .get("databases", {})
 2916            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2917        )
 2918
 2919        # Get param annotations
 2920        if param.get("annotations", None) and isinstance(
 2921            param.get("annotations", None), str
 2922        ):
 2923            log.debug(param.get("annotations", None))
 2924            param_annotation_list = param.get("annotations").split(",")
 2925        else:
 2926            param_annotation_list = []
 2927
 2928        # Each tools param
 2929        if param.get("annotation_parquet", None) != None:
 2930            log.debug(
 2931                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2932            )
 2933            if isinstance(param.get("annotation_parquet", None), list):
 2934                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2935            else:
 2936                param_annotation_list.append(param.get("annotation_parquet"))
 2937        if param.get("annotation_snpsift", None) != None:
 2938            if isinstance(param.get("annotation_snpsift", None), list):
 2939                param_annotation_list.append(
 2940                    "snpsift:"
 2941                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2942                )
 2943            else:
 2944                param_annotation_list.append(
 2945                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2946                )
 2947        if param.get("annotation_snpeff", None) != None:
 2948            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2949        if param.get("annotation_bcftools", None) != None:
 2950            if isinstance(param.get("annotation_bcftools", None), list):
 2951                param_annotation_list.append(
 2952                    "bcftools:"
 2953                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2954                )
 2955            else:
 2956                param_annotation_list.append(
 2957                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2958                )
 2959        if param.get("annotation_annovar", None) != None:
 2960            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2961        if param.get("annotation_exomiser", None) != None:
 2962            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2963        if param.get("annotation_splice", None) != None:
 2964            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2965
 2966        # Merge param annotations list
 2967        param["annotations"] = ",".join(param_annotation_list)
 2968
 2969        # debug
 2970        log.debug(f"param_annotations={param['annotations']}")
 2971
 2972        if param.get("annotations"):
 2973
 2974            # Log
 2975            # log.info("Annotations - Check annotation parameters")
 2976
 2977            if not "annotation" in param:
 2978                param["annotation"] = {}
 2979
 2980            # List of annotations parameters
 2981            annotations_list_input = {}
 2982            if isinstance(param.get("annotations", None), str):
 2983                annotation_file_list = [
 2984                    value for value in param.get("annotations", "").split(",")
 2985                ]
 2986                for annotation_file in annotation_file_list:
 2987                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2988            else:
 2989                annotations_list_input = param.get("annotations", {})
 2990
 2991            log.info(f"Quick Annotations:")
 2992            for annotation_key in list(annotations_list_input.keys()):
 2993                log.info(f"   {annotation_key}")
 2994
 2995            # List of annotations and associated fields
 2996            annotations_list = {}
 2997
 2998            for annotation_file in annotations_list_input:
 2999
 3000                # Explode annotations if ALL
 3001                if (
 3002                    annotation_file.upper() == "ALL"
 3003                    or annotation_file.upper().startswith("ALL:")
 3004                ):
 3005
 3006                    # check ALL parameters (formats, releases)
 3007                    annotation_file_split = annotation_file.split(":")
 3008                    database_formats = "parquet"
 3009                    database_releases = "current"
 3010                    for annotation_file_option in annotation_file_split[1:]:
 3011                        database_all_options_split = annotation_file_option.split("=")
 3012                        if database_all_options_split[0] == "format":
 3013                            database_formats = database_all_options_split[1].split("+")
 3014                        if database_all_options_split[0] == "release":
 3015                            database_releases = database_all_options_split[1].split("+")
 3016
 3017                    # Scan for availabled databases
 3018                    databases_infos_dict = self.scan_databases(
 3019                        database_formats=database_formats,
 3020                        database_releases=database_releases,
 3021                    )
 3022
 3023                    # Add found databases in annotation parameters
 3024                    for database_infos in databases_infos_dict.keys():
 3025                        annotations_list[database_infos] = {"INFO": None}
 3026
 3027                else:
 3028                    annotations_list[annotation_file] = annotations_list_input[
 3029                        annotation_file
 3030                    ]
 3031
 3032            # Check each databases
 3033            if len(annotations_list):
 3034
 3035                log.info(
 3036                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3037                )
 3038
 3039                for annotation_file in annotations_list:
 3040
 3041                    # Init
 3042                    annotations = annotations_list.get(annotation_file, None)
 3043
 3044                    # Annotation snpEff
 3045                    if annotation_file.startswith("snpeff"):
 3046
 3047                        log.debug(f"Quick Annotation snpEff")
 3048
 3049                        if "snpeff" not in param["annotation"]:
 3050                            param["annotation"]["snpeff"] = {}
 3051
 3052                        if "options" not in param["annotation"]["snpeff"]:
 3053                            param["annotation"]["snpeff"]["options"] = ""
 3054
 3055                        # snpEff options in annotations
 3056                        param["annotation"]["snpeff"]["options"] = "".join(
 3057                            annotation_file.split(":")[1:]
 3058                        )
 3059
 3060                    # Annotation Annovar
 3061                    elif annotation_file.startswith("annovar"):
 3062
 3063                        log.debug(f"Quick Annotation Annovar")
 3064
 3065                        if "annovar" not in param["annotation"]:
 3066                            param["annotation"]["annovar"] = {}
 3067
 3068                        if "annotations" not in param["annotation"]["annovar"]:
 3069                            param["annotation"]["annovar"]["annotations"] = {}
 3070
 3071                        # Options
 3072                        annotation_file_split = annotation_file.split(":")
 3073                        for annotation_file_annotation in annotation_file_split[1:]:
 3074                            if annotation_file_annotation:
 3075                                param["annotation"]["annovar"]["annotations"][
 3076                                    annotation_file_annotation
 3077                                ] = annotations
 3078
 3079                    # Annotation Exomiser
 3080                    elif annotation_file.startswith("exomiser"):
 3081
 3082                        log.debug(f"Quick Annotation Exomiser")
 3083
 3084                        param["annotation"]["exomiser"] = params_string_to_dict(
 3085                            annotation_file
 3086                        )
 3087
 3088                    # Annotation Splice
 3089                    elif annotation_file.startswith("splice"):
 3090
 3091                        log.debug(f"Quick Annotation Splice")
 3092
 3093                        param["annotation"]["splice"] = params_string_to_dict(
 3094                            annotation_file
 3095                        )
 3096
 3097                    # Annotation Parquet or BCFTOOLS
 3098                    else:
 3099
 3100                        # Tools detection
 3101                        if annotation_file.startswith("bcftools:"):
 3102                            annotation_tool_initial = "bcftools"
 3103                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3104                        elif annotation_file.startswith("snpsift:"):
 3105                            annotation_tool_initial = "snpsift"
 3106                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3107                        elif annotation_file.startswith("bigwig:"):
 3108                            annotation_tool_initial = "bigwig"
 3109                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3110                        else:
 3111                            annotation_tool_initial = None
 3112
 3113                        # list of files
 3114                        annotation_file_list = annotation_file.replace("+", ":").split(
 3115                            ":"
 3116                        )
 3117
 3118                        for annotation_file in annotation_file_list:
 3119
 3120                            if annotation_file:
 3121
 3122                                # Annotation tool initial
 3123                                annotation_tool = annotation_tool_initial
 3124
 3125                                # Find file
 3126                                annotation_file_found = None
 3127
 3128                                if os.path.exists(annotation_file):
 3129                                    annotation_file_found = annotation_file
 3130                                elif os.path.exists(full_path(annotation_file)):
 3131                                    annotation_file_found = full_path(annotation_file)
 3132                                else:
 3133                                    # Find within assembly folders
 3134                                    for annotations_database in annotations_databases:
 3135                                        found_files = find_all(
 3136                                            annotation_file,
 3137                                            os.path.join(
 3138                                                annotations_database, assembly
 3139                                            ),
 3140                                        )
 3141                                        if len(found_files) > 0:
 3142                                            annotation_file_found = found_files[0]
 3143                                            break
 3144                                    if not annotation_file_found and not assembly:
 3145                                        # Find within folders
 3146                                        for (
 3147                                            annotations_database
 3148                                        ) in annotations_databases:
 3149                                            found_files = find_all(
 3150                                                annotation_file, annotations_database
 3151                                            )
 3152                                            if len(found_files) > 0:
 3153                                                annotation_file_found = found_files[0]
 3154                                                break
 3155                                log.debug(
 3156                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3157                                )
 3158
 3159                                # Full path
 3160                                annotation_file_found = full_path(annotation_file_found)
 3161
 3162                                if annotation_file_found:
 3163
 3164                                    database = Database(database=annotation_file_found)
 3165                                    quick_annotation_format = database.get_format()
 3166                                    quick_annotation_is_compressed = (
 3167                                        database.is_compressed()
 3168                                    )
 3169                                    quick_annotation_is_indexed = os.path.exists(
 3170                                        f"{annotation_file_found}.tbi"
 3171                                    )
 3172                                    bcftools_preference = False
 3173
 3174                                    # Check Annotation Tool
 3175                                    if not annotation_tool:
 3176                                        if (
 3177                                            bcftools_preference
 3178                                            and quick_annotation_format
 3179                                            in ["vcf", "bed"]
 3180                                            and quick_annotation_is_compressed
 3181                                            and quick_annotation_is_indexed
 3182                                        ):
 3183                                            annotation_tool = "bcftools"
 3184                                        elif quick_annotation_format in [
 3185                                            "vcf",
 3186                                            "bed",
 3187                                            "tsv",
 3188                                            "tsv",
 3189                                            "csv",
 3190                                            "json",
 3191                                            "tbl",
 3192                                            "parquet",
 3193                                            "duckdb",
 3194                                        ]:
 3195                                            annotation_tool = "parquet"
 3196                                        elif quick_annotation_format in ["bw"]:
 3197                                            annotation_tool = "bigwig"
 3198                                        else:
 3199                                            log.error(
 3200                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3201                                            )
 3202                                            raise ValueError(
 3203                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3204                                            )
 3205
 3206                                    log.debug(
 3207                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3208                                    )
 3209
 3210                                    # Annotation Tool dispatch
 3211                                    if annotation_tool:
 3212                                        if annotation_tool not in param["annotation"]:
 3213                                            param["annotation"][annotation_tool] = {}
 3214                                        if (
 3215                                            "annotations"
 3216                                            not in param["annotation"][annotation_tool]
 3217                                        ):
 3218                                            param["annotation"][annotation_tool][
 3219                                                "annotations"
 3220                                            ] = {}
 3221                                        param["annotation"][annotation_tool][
 3222                                            "annotations"
 3223                                        ][annotation_file_found] = annotations
 3224
 3225                                else:
 3226                                    log.warning(
 3227                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3228                                    )
 3229
 3230                self.set_param(param)
 3231
 3232        if param.get("annotation", None):
 3233            log.info("Annotations")
 3234            if param.get("annotation", {}).get("parquet", None):
 3235                log.info("Annotations 'parquet'...")
 3236                self.annotation_parquet()
 3237            if param.get("annotation", {}).get("bcftools", None):
 3238                log.info("Annotations 'bcftools'...")
 3239                self.annotation_bcftools()
 3240            if param.get("annotation", {}).get("snpsift", None):
 3241                log.info("Annotations 'snpsift'...")
 3242                self.annotation_snpsift()
 3243            if param.get("annotation", {}).get("bigwig", None):
 3244                log.info("Annotations 'bigwig'...")
 3245                self.annotation_bigwig()
 3246            if param.get("annotation", {}).get("annovar", None):
 3247                log.info("Annotations 'annovar'...")
 3248                self.annotation_annovar()
 3249            if param.get("annotation", {}).get("snpeff", None):
 3250                log.info("Annotations 'snpeff'...")
 3251                self.annotation_snpeff()
 3252            if param.get("annotation", {}).get("exomiser", None) is not None:
 3253                log.info("Annotations 'exomiser'...")
 3254                self.annotation_exomiser()
 3255            if param.get("annotation", {}).get("splice", None) is not None:
 3256                log.info("Annotations 'splice' ...")
 3257                self.annotation_splice()
 3258
 3259        # Explode INFOS fields into table fields
 3260        if self.get_explode_infos():
 3261            self.explode_infos(
 3262                prefix=self.get_explode_infos_prefix(),
 3263                fields=self.get_explode_infos_fields(),
 3264                force=True,
 3265            )
 3266
 3267    def annotation_bigwig(self, threads: int = None) -> None:
 3268        """
 3269        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3270
 3271        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3272        number of threads to be used for parallel processing during the annotation process. If the
 3273        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3274        threads to use based on the system configuration
 3275        :type threads: int
 3276        :return: True
 3277        """
 3278
 3279        # DEBUG
 3280        log.debug("Start annotation with bigwig databases")
 3281
 3282        # # Threads
 3283        # if not threads:
 3284        #     threads = self.get_threads()
 3285        # log.debug("Threads: " + str(threads))
 3286
 3287        # Config
 3288        config = self.get_config()
 3289        log.debug("Config: " + str(config))
 3290
 3291        # Config - BCFTools databases folders
 3292        databases_folders = set(
 3293            self.get_config()
 3294            .get("folders", {})
 3295            .get("databases", {})
 3296            .get("annotations", ["."])
 3297            + self.get_config()
 3298            .get("folders", {})
 3299            .get("databases", {})
 3300            .get("bigwig", ["."])
 3301        )
 3302        log.debug("Databases annotations: " + str(databases_folders))
 3303
 3304        # Param
 3305        annotations = (
 3306            self.get_param()
 3307            .get("annotation", {})
 3308            .get("bigwig", {})
 3309            .get("annotations", None)
 3310        )
 3311        log.debug("Annotations: " + str(annotations))
 3312
 3313        # Assembly
 3314        assembly = self.get_param().get(
 3315            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3316        )
 3317
 3318        # Data
 3319        table_variants = self.get_table_variants()
 3320
 3321        # Check if not empty
 3322        log.debug("Check if not empty")
 3323        sql_query_chromosomes = (
 3324            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3325        )
 3326        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3327        if not sql_query_chromosomes_df["count"][0]:
 3328            log.info(f"VCF empty")
 3329            return
 3330
 3331        # VCF header
 3332        vcf_reader = self.get_header()
 3333        log.debug("Initial header: " + str(vcf_reader.infos))
 3334
 3335        # Existing annotations
 3336        for vcf_annotation in self.get_header().infos:
 3337
 3338            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3339            log.debug(
 3340                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3341            )
 3342
 3343        if annotations:
 3344
 3345            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3346
 3347                # Export VCF file
 3348                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3349
 3350                # annotation_bigwig_config
 3351                annotation_bigwig_config_list = []
 3352
 3353                for annotation in annotations:
 3354                    annotation_fields = annotations[annotation]
 3355
 3356                    # Annotation Name
 3357                    annotation_name = os.path.basename(annotation)
 3358
 3359                    if not annotation_fields:
 3360                        annotation_fields = {"INFO": None}
 3361
 3362                    log.debug(f"Annotation '{annotation_name}'")
 3363                    log.debug(
 3364                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3365                    )
 3366
 3367                    # Create Database
 3368                    database = Database(
 3369                        database=annotation,
 3370                        databases_folders=databases_folders,
 3371                        assembly=assembly,
 3372                    )
 3373
 3374                    # Find files
 3375                    db_file = database.get_database()
 3376                    db_file = full_path(db_file)
 3377                    db_hdr_file = database.get_header_file()
 3378                    db_hdr_file = full_path(db_hdr_file)
 3379                    db_file_type = database.get_format()
 3380
 3381                    # If db_file is http ?
 3382                    if database.get_database().startswith("http"):
 3383
 3384                        # Datbase is HTTP URL
 3385                        db_file_is_http = True
 3386
 3387                        # DB file keep as URL
 3388                        db_file = database.get_database()
 3389                        log.warning(
 3390                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3391                        )
 3392
 3393                        # Retrieve automatic annotation field name
 3394                        annotation_field = clean_annotation_field(
 3395                            os.path.basename(db_file).replace(".bw", "")
 3396                        )
 3397                        log.debug(
 3398                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3399                        )
 3400
 3401                        # Create automatic header file
 3402                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3403                        with open(db_hdr_file, "w") as f:
 3404                            f.write("##fileformat=VCFv4.2\n")
 3405                            f.write(
 3406                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3407                            )
 3408                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3409
 3410                    else:
 3411
 3412                        # Datbase is NOT HTTP URL
 3413                        db_file_is_http = False
 3414
 3415                    # Check index - try to create if not exists
 3416                    if (
 3417                        db_file is None
 3418                        or db_hdr_file is None
 3419                        or (not os.path.exists(db_file) and not db_file_is_http)
 3420                        or not os.path.exists(db_hdr_file)
 3421                        or not db_file_type in ["bw"]
 3422                    ):
 3423                        # if False:
 3424                        log.error("Annotation failed: database not valid")
 3425                        log.error(f"Annotation annotation file: {db_file}")
 3426                        log.error(f"Annotation annotation file type: {db_file_type}")
 3427                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3428                        raise ValueError(
 3429                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3430                        )
 3431                    else:
 3432
 3433                        # Log
 3434                        log.debug(
 3435                            f"Annotation '{annotation}' - file: "
 3436                            + str(db_file)
 3437                            + " and "
 3438                            + str(db_hdr_file)
 3439                        )
 3440
 3441                        # Load header as VCF object
 3442                        db_hdr_vcf = Variants(input=db_hdr_file)
 3443                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3444                        log.debug(
 3445                            "Annotation database header: "
 3446                            + str(db_hdr_vcf_header_infos)
 3447                        )
 3448
 3449                        # For all fields in database
 3450                        annotation_fields_full = False
 3451                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3452                            annotation_fields = {
 3453                                key: key for key in db_hdr_vcf_header_infos
 3454                            }
 3455                            log.debug(
 3456                                "Annotation database header - All annotations added: "
 3457                                + str(annotation_fields)
 3458                            )
 3459                            annotation_fields_full = True
 3460
 3461                        # Init
 3462                        cyvcf2_header_rename_dict = {}
 3463                        cyvcf2_header_list = []
 3464                        cyvcf2_header_indexes = {}
 3465
 3466                        # process annotation fields
 3467                        for annotation_field in annotation_fields:
 3468
 3469                            # New annotation name
 3470                            annotation_field_new = annotation_fields[annotation_field]
 3471
 3472                            # Check annotation field and index in header
 3473                            if (
 3474                                annotation_field
 3475                                in db_hdr_vcf.get_header_columns_as_list()
 3476                            ):
 3477                                annotation_field_index = (
 3478                                    db_hdr_vcf.get_header_columns_as_list().index(
 3479                                        annotation_field
 3480                                    )
 3481                                    - 3
 3482                                )
 3483                                cyvcf2_header_indexes[annotation_field_new] = (
 3484                                    annotation_field_index
 3485                                )
 3486                            else:
 3487                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3488                                log.error(msg_err)
 3489                                raise ValueError(msg_err)
 3490
 3491                            # Append annotation field in cyvcf2 header list
 3492                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3493                                db_hdr_vcf_header_infos[annotation_field].id
 3494                            )
 3495                            cyvcf2_header_list.append(
 3496                                {
 3497                                    "ID": annotation_field_new,
 3498                                    "Number": db_hdr_vcf_header_infos[
 3499                                        annotation_field
 3500                                    ].num,
 3501                                    "Type": db_hdr_vcf_header_infos[
 3502                                        annotation_field
 3503                                    ].type,
 3504                                    "Description": db_hdr_vcf_header_infos[
 3505                                        annotation_field
 3506                                    ].desc,
 3507                                }
 3508                            )
 3509
 3510                            # Add header on VCF
 3511                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3512                                annotation_field_new,
 3513                                db_hdr_vcf_header_infos[annotation_field].num,
 3514                                db_hdr_vcf_header_infos[annotation_field].type,
 3515                                db_hdr_vcf_header_infos[annotation_field].desc,
 3516                                "HOWARD BigWig annotation",
 3517                                "unknown",
 3518                                self.code_type_map[
 3519                                    db_hdr_vcf_header_infos[annotation_field].type
 3520                                ],
 3521                            )
 3522
 3523                        # Load bigwig database
 3524                        bw_db = pyBigWig.open(db_file)
 3525                        if bw_db.isBigWig():
 3526                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3527                        else:
 3528                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3529                            log.error(msg_err)
 3530                            raise ValueError(msg_err)
 3531
 3532                        annotation_bigwig_config_list.append(
 3533                            {
 3534                                "db_file": db_file,
 3535                                "bw_db": bw_db,
 3536                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3537                                "cyvcf2_header_list": cyvcf2_header_list,
 3538                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3539                            }
 3540                        )
 3541
 3542                # Annotate
 3543                if annotation_bigwig_config_list:
 3544
 3545                    # Annotation config
 3546                    log.debug(
 3547                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3548                    )
 3549
 3550                    # Export VCF file
 3551                    self.export_variant_vcf(
 3552                        vcf_file=tmp_vcf_name,
 3553                        remove_info=True,
 3554                        add_samples=False,
 3555                        index=True,
 3556                    )
 3557
 3558                    # Load input tmp file
 3559                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3560
 3561                    # Add header in input file
 3562                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3563                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3564                            "cyvcf2_header_list", []
 3565                        ):
 3566                            log.info(
 3567                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3568                            )
 3569                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3570
 3571                    # Create output VCF file
 3572                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3573                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3574
 3575                    # Fetch variants
 3576                    log.info(f"Annotations 'bigwig' start...")
 3577                    for variant in input_vcf:
 3578
 3579                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3580
 3581                            # DB and indexes
 3582                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3583                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3584                                "cyvcf2_header_indexes", None
 3585                            )
 3586
 3587                            # Retrieve value from chrom pos
 3588                            res = bw_db.values(
 3589                                variant.CHROM, variant.POS - 1, variant.POS
 3590                            )
 3591
 3592                            # For each annotation fields (and indexes)
 3593                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3594
 3595                                # If value is NOT nNone
 3596                                if not np.isnan(
 3597                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3598                                ):
 3599                                    variant.INFO[cyvcf2_header_index] = res[
 3600                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3601                                    ]
 3602
 3603                        # Add record in output file
 3604                        output_vcf.write_record(variant)
 3605
 3606                    # Log
 3607                    log.debug(f"Annotation done.")
 3608
 3609                    # Close and write file
 3610                    log.info(f"Annotations 'bigwig' write...")
 3611                    output_vcf.close()
 3612                    log.debug(f"Write done.")
 3613
 3614                    # Update variants
 3615                    log.info(f"Annotations 'bigwig' update...")
 3616                    self.update_from_vcf(output_vcf_file)
 3617                    log.debug(f"Update done.")
 3618
 3619        return True
 3620
 3621    def annotation_snpsift(self, threads: int = None) -> None:
 3622        """
 3623        This function annotate with bcftools
 3624
 3625        :param threads: Number of threads to use
 3626        :return: the value of the variable "return_value".
 3627        """
 3628
 3629        # DEBUG
 3630        log.debug("Start annotation with bcftools databases")
 3631
 3632        # Threads
 3633        if not threads:
 3634            threads = self.get_threads()
 3635        log.debug("Threads: " + str(threads))
 3636
 3637        # Config
 3638        config = self.get_config()
 3639        log.debug("Config: " + str(config))
 3640
 3641        # Config - snpSift
 3642        snpsift_bin_command = get_bin_command(
 3643            bin="SnpSift.jar",
 3644            tool="snpsift",
 3645            bin_type="jar",
 3646            config=config,
 3647            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3648        )
 3649        if not snpsift_bin_command:
 3650            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3651            log.error(msg_err)
 3652            raise ValueError(msg_err)
 3653
 3654        # Config - bcftools
 3655        bcftools_bin_command = get_bin_command(
 3656            bin="bcftools",
 3657            tool="bcftools",
 3658            bin_type="bin",
 3659            config=config,
 3660            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3661        )
 3662        if not bcftools_bin_command:
 3663            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3664            log.error(msg_err)
 3665            raise ValueError(msg_err)
 3666
 3667        # Config - BCFTools databases folders
 3668        databases_folders = set(
 3669            self.get_config()
 3670            .get("folders", {})
 3671            .get("databases", {})
 3672            .get("annotations", ["."])
 3673            + self.get_config()
 3674            .get("folders", {})
 3675            .get("databases", {})
 3676            .get("bcftools", ["."])
 3677        )
 3678        log.debug("Databases annotations: " + str(databases_folders))
 3679
 3680        # Param
 3681        annotations = (
 3682            self.get_param()
 3683            .get("annotation", {})
 3684            .get("snpsift", {})
 3685            .get("annotations", None)
 3686        )
 3687        log.debug("Annotations: " + str(annotations))
 3688
 3689        # Assembly
 3690        assembly = self.get_param().get(
 3691            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3692        )
 3693
 3694        # Data
 3695        table_variants = self.get_table_variants()
 3696
 3697        # Check if not empty
 3698        log.debug("Check if not empty")
 3699        sql_query_chromosomes = (
 3700            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3701        )
 3702        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3703        if not sql_query_chromosomes_df["count"][0]:
 3704            log.info(f"VCF empty")
 3705            return
 3706
 3707        # VCF header
 3708        vcf_reader = self.get_header()
 3709        log.debug("Initial header: " + str(vcf_reader.infos))
 3710
 3711        # Existing annotations
 3712        for vcf_annotation in self.get_header().infos:
 3713
 3714            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3715            log.debug(
 3716                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3717            )
 3718
 3719        if annotations:
 3720
 3721            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3722
 3723                # Export VCF file
 3724                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3725
 3726                # Init
 3727                commands = {}
 3728
 3729                for annotation in annotations:
 3730                    annotation_fields = annotations[annotation]
 3731
 3732                    # Annotation Name
 3733                    annotation_name = os.path.basename(annotation)
 3734
 3735                    if not annotation_fields:
 3736                        annotation_fields = {"INFO": None}
 3737
 3738                    log.debug(f"Annotation '{annotation_name}'")
 3739                    log.debug(
 3740                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3741                    )
 3742
 3743                    # Create Database
 3744                    database = Database(
 3745                        database=annotation,
 3746                        databases_folders=databases_folders,
 3747                        assembly=assembly,
 3748                    )
 3749
 3750                    # Find files
 3751                    db_file = database.get_database()
 3752                    db_file = full_path(db_file)
 3753                    db_hdr_file = database.get_header_file()
 3754                    db_hdr_file = full_path(db_hdr_file)
 3755                    db_file_type = database.get_format()
 3756                    db_tbi_file = f"{db_file}.tbi"
 3757                    db_file_compressed = database.is_compressed()
 3758
 3759                    # Check if compressed
 3760                    if not db_file_compressed:
 3761                        log.error(
 3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3763                        )
 3764                        raise ValueError(
 3765                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3766                        )
 3767
 3768                    # Check if indexed
 3769                    if not os.path.exists(db_tbi_file):
 3770                        log.error(
 3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3772                        )
 3773                        raise ValueError(
 3774                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3775                        )
 3776
 3777                    # Check index - try to create if not exists
 3778                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3779                        log.error("Annotation failed: database not valid")
 3780                        log.error(f"Annotation annotation file: {db_file}")
 3781                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3782                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3783                        raise ValueError(
 3784                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3785                        )
 3786                    else:
 3787
 3788                        log.debug(
 3789                            f"Annotation '{annotation}' - file: "
 3790                            + str(db_file)
 3791                            + " and "
 3792                            + str(db_hdr_file)
 3793                        )
 3794
 3795                        # Load header as VCF object
 3796                        db_hdr_vcf = Variants(input=db_hdr_file)
 3797                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3798                        log.debug(
 3799                            "Annotation database header: "
 3800                            + str(db_hdr_vcf_header_infos)
 3801                        )
 3802
 3803                        # For all fields in database
 3804                        annotation_fields_full = False
 3805                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3806                            annotation_fields = {
 3807                                key: key for key in db_hdr_vcf_header_infos
 3808                            }
 3809                            log.debug(
 3810                                "Annotation database header - All annotations added: "
 3811                                + str(annotation_fields)
 3812                            )
 3813                            annotation_fields_full = True
 3814
 3815                        # # Create file for field rename
 3816                        # log.debug("Create file for field rename")
 3817                        # tmp_rename = NamedTemporaryFile(
 3818                        #     prefix=self.get_prefix(),
 3819                        #     dir=self.get_tmp_dir(),
 3820                        #     suffix=".rename",
 3821                        #     delete=False,
 3822                        # )
 3823                        # tmp_rename_name = tmp_rename.name
 3824                        # tmp_files.append(tmp_rename_name)
 3825
 3826                        # Number of fields
 3827                        nb_annotation_field = 0
 3828                        annotation_list = []
 3829                        annotation_infos_rename_list = []
 3830
 3831                        for annotation_field in annotation_fields:
 3832
 3833                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3834                            annotation_fields_new_name = annotation_fields.get(
 3835                                annotation_field, annotation_field
 3836                            )
 3837                            if not annotation_fields_new_name:
 3838                                annotation_fields_new_name = annotation_field
 3839
 3840                            # Check if field is in DB and if field is not elready in input data
 3841                            if (
 3842                                annotation_field in db_hdr_vcf.get_header().infos
 3843                                and annotation_fields_new_name
 3844                                not in self.get_header().infos
 3845                            ):
 3846
 3847                                log.info(
 3848                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3849                                )
 3850
 3851                                # BCFTools annotate param to rename fields
 3852                                if annotation_field != annotation_fields_new_name:
 3853                                    annotation_infos_rename_list.append(
 3854                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3855                                    )
 3856
 3857                                # Add INFO field to header
 3858                                db_hdr_vcf_header_infos_number = (
 3859                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3860                                )
 3861                                db_hdr_vcf_header_infos_type = (
 3862                                    db_hdr_vcf_header_infos[annotation_field].type
 3863                                    or "String"
 3864                                )
 3865                                db_hdr_vcf_header_infos_description = (
 3866                                    db_hdr_vcf_header_infos[annotation_field].desc
 3867                                    or f"{annotation_field} description"
 3868                                )
 3869                                db_hdr_vcf_header_infos_source = (
 3870                                    db_hdr_vcf_header_infos[annotation_field].source
 3871                                    or "unknown"
 3872                                )
 3873                                db_hdr_vcf_header_infos_version = (
 3874                                    db_hdr_vcf_header_infos[annotation_field].version
 3875                                    or "unknown"
 3876                                )
 3877
 3878                                vcf_reader.infos[annotation_fields_new_name] = (
 3879                                    vcf.parser._Info(
 3880                                        annotation_fields_new_name,
 3881                                        db_hdr_vcf_header_infos_number,
 3882                                        db_hdr_vcf_header_infos_type,
 3883                                        db_hdr_vcf_header_infos_description,
 3884                                        db_hdr_vcf_header_infos_source,
 3885                                        db_hdr_vcf_header_infos_version,
 3886                                        self.code_type_map[
 3887                                            db_hdr_vcf_header_infos_type
 3888                                        ],
 3889                                    )
 3890                                )
 3891
 3892                                annotation_list.append(annotation_field)
 3893
 3894                                nb_annotation_field += 1
 3895
 3896                            else:
 3897
 3898                                if (
 3899                                    annotation_field
 3900                                    not in db_hdr_vcf.get_header().infos
 3901                                ):
 3902                                    log.warning(
 3903                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3904                                    )
 3905                                if (
 3906                                    annotation_fields_new_name
 3907                                    in self.get_header().infos
 3908                                ):
 3909                                    log.warning(
 3910                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3911                                    )
 3912
 3913                        log.info(
 3914                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3915                        )
 3916
 3917                        annotation_infos = ",".join(annotation_list)
 3918
 3919                        if annotation_infos != "":
 3920
 3921                            # Annotated VCF (and error file)
 3922                            tmp_annotation_vcf_name = os.path.join(
 3923                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3924                            )
 3925                            tmp_annotation_vcf_name_err = (
 3926                                tmp_annotation_vcf_name + ".err"
 3927                            )
 3928
 3929                            # Add fields to annotate
 3930                            if not annotation_fields_full:
 3931                                annotation_infos_option = f"-info {annotation_infos}"
 3932                            else:
 3933                                annotation_infos_option = ""
 3934
 3935                            # Info fields rename
 3936                            if annotation_infos_rename_list:
 3937                                annotation_infos_rename = " -c " + ",".join(
 3938                                    annotation_infos_rename_list
 3939                                )
 3940                            else:
 3941                                annotation_infos_rename = ""
 3942
 3943                            # Annotate command
 3944                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3945
 3946                            # Add command
 3947                            commands[command_annotate] = tmp_annotation_vcf_name
 3948
 3949                if commands:
 3950
 3951                    # Export VCF file
 3952                    self.export_variant_vcf(
 3953                        vcf_file=tmp_vcf_name,
 3954                        remove_info=True,
 3955                        add_samples=False,
 3956                        index=True,
 3957                    )
 3958                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3959
 3960                    # Num command
 3961                    nb_command = 0
 3962
 3963                    # Annotate
 3964                    for command_annotate in commands:
 3965                        nb_command += 1
 3966                        log.info(
 3967                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3968                        )
 3969                        log.debug(f"command_annotate={command_annotate}")
 3970                        run_parallel_commands([command_annotate], threads)
 3971
 3972                        # Debug
 3973                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3974
 3975                        # Update variants
 3976                        log.info(
 3977                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3978                        )
 3979                        self.update_from_vcf(commands[command_annotate])
 3980
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using `bcftools annotate`.

        For each database configured under param["annotation"]["bcftools"]["annotations"],
        this method:
          1. validates the database file (must be compressed, tabix-indexed, and
             accompanied by a header file),
          2. registers the requested INFO fields in the in-memory VCF header
             (skipping fields already present in the input, and fields absent
             from the database header),
          3. exports the current variants to a temporary bgzipped VCF,
          4. builds one `bcftools annotate` command per (database, chromosome),
             restricted via `--regions-file` to a BED of padded, merged variant
             regions,
          5. runs the commands in parallel, merges all annotated outputs with
             `bcftools merge`, and
          6. loads the merged annotations back into the variants table via
             `update_from_vcf`.

        :param threads: Number of threads to use; defaults to self.get_threads()
        :raises ValueError: if no bcftools binary is available, if a database
            file is not compressed/indexed/valid, or if any annotation command
            wrote "[E::" lines to its captured stderr
        :return: None (the variants table is updated in place)
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the instance-level setting when not provided
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed here but never referenced later in
        # this method — temp cleanup is instead done via the "rm -f" appended to
        # the merge command below. Confirm whether this flag is still needed.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (resolved from config or default tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones.
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: mapping of database path/name -> requested fields (or None for all)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly: param takes precedence over config, then the global default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # Reserve the temp file name now; the actual export happens later,
        # only when at least one annotate command was built.
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated in place below when new INFO fields are registered)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace of INFO fields already in the input)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-(database, chromosome) annotated VCFs to merge
            commands = []  # shell commands to run in parallel
            tmp_files = []  # temp files removed after the merge
            err_files = []  # stderr capture files, scanned for [W::/[E:: messages

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields" (expanded below)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within the configured folders)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzipped input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (.tbi required for region-restricted annotation)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object (header file parsed as a Variants input)
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database: "ALL" or "INFO" expands to every
                    # INFO field declared in the database header (no renaming)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name; falls back to the original field name
                        # when no rename value is provided.
                        # (Original note: "if parametered SKIPPED !!!!!! not managed actually TODO")
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, substituting defaults for
                            # any attribute missing from the database header line
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            # Register the (possibly renamed) field on the shared header
                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c column spec: "NEW:=INFO/OLD" renames,
                            # a bare field name annotates as-is
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Explain why the field was skipped
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" meta lines,
                        # dropping any "#CHROM" header line or variant records
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped header files, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases carry no INFO column names, so prepend
                        # the positional CHROM,POS,POS columns to the -c spec
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One annotate command per chromosome
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: pad each variant position by
                            # +/- `window` bp (clamped at 0), then merge overlaps
                            # to keep the region list small
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files: output VCF and its stderr capture
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: --pair-logic exact matches REF/ALT exactly;
                            # -Oz1 writes bgzipped VCF at compression level 1;
                            # tabix indexes the output for the later merge
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file: the actual input export (header-only INFO,
                # no samples, indexed) that all annotate commands read
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the thread budget across the parallel annotate commands.
                # NOTE(review): this inner `if commands:` is always true here
                # (guarded by the enclosing `if commands:`), so the else branch
                # is dead code.
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() can yield 0 when commands outnumber threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands (rewrite each command
                # to include --threads when more than one thread per command)
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-(database, chromosome) outputs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file (delete=True: removed automatically on close)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command, chained after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge: --force-samples tolerates duplicate sample
                    # names across the intermediate files
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan every captured stderr file; htslib
                    # prefixes warnings with "[W::" and errors with "[E::"
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info (deduplicated warnings + errors)
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info (every deduplicated message)
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed: any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged, annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
 4462    def annotation_exomiser(self, threads: int = None) -> None:
 4463        """
 4464        This function annotate with Exomiser
 4465
 4466        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4467        - "analysis" (dict/file):
 4468            Full analysis dictionnary parameters (see Exomiser docs).
 4469            Either a dict, or a file in JSON or YAML format.
 4470            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4471            Default : None
 4472        - "preset" (string):
 4473            Analysis preset (available in config folder).
 4474            Used if no full "analysis" is provided.
 4475            Default: "exome"
 4476        - "phenopacket" (dict/file):
 4477            Samples and phenotypic features parameters (see Exomiser docs).
 4478            Either a dict, or a file in JSON or YAML format.
 4479            Default: None
 4480        - "subject" (dict):
 4481            Sample parameters (see Exomiser docs).
 4482            Example:
 4483                "subject":
 4484                    {
 4485                        "id": "ISDBM322017",
 4486                        "sex": "FEMALE"
 4487                    }
 4488            Default: None
 4489        - "sample" (string):
 4490            Sample name to construct "subject" section:
 4491                "subject":
 4492                    {
 4493                        "id": "<sample>",
 4494                        "sex": "UNKNOWN_SEX"
 4495                    }
 4496            Default: None
 4497        - "phenotypicFeatures" (dict)
 4498            Phenotypic features to construct "subject" section.
 4499            Example:
 4500                "phenotypicFeatures":
 4501                    [
 4502                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4503                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4504                    ]
 4505        - "hpo" (list)
 4506            List of HPO ids as phenotypic features.
 4507            Example:
 4508                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4509            Default: []
 4510        - "outputOptions" (dict):
 4511            Output options (see Exomiser docs).
 4512            Default:
 4513                "output_options" =
 4514                    {
 4515                        "outputContributingVariantsOnly": False,
 4516                        "numGenes": 0,
 4517                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4518                    }
 4519        - "transcript_source" (string):
 4520            Transcript source (either "refseq", "ucsc", "ensembl")
 4521            Default: "refseq"
 4522        - "exomiser_to_info" (boolean):
 4523            Add exomiser TSV file columns as INFO fields in VCF.
 4524            Default: False
 4525        - "release" (string):
 4526            Exomiser database release.
 4527            If not exists, database release will be downloaded (take a while).
 4528            Default: None (provided by application.properties configuration file)
 4529        - "exomiser_application_properties" (file):
 4530            Exomiser configuration file (see Exomiser docs).
 4531            Useful to automatically download databases (especially for specific genome databases).
 4532
 4533        Notes:
 4534        - If no sample in parameters, first sample in VCF will be chosen
 4535        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4536
 4537        :param threads: The number of threads to use
 4538        :return: None.
 4539        """
 4540
 4541        # DEBUG
 4542        log.debug("Start annotation with Exomiser databases")
 4543
 4544        # Threads
 4545        if not threads:
 4546            threads = self.get_threads()
 4547        log.debug("Threads: " + str(threads))
 4548
 4549        # Config
 4550        config = self.get_config()
 4551        log.debug("Config: " + str(config))
 4552
 4553        # Config - Folders - Databases
 4554        databases_folders = (
 4555            config.get("folders", {})
 4556            .get("databases", {})
 4557            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4558        )
 4559        databases_folders = full_path(databases_folders)
 4560        if not os.path.exists(databases_folders):
 4561            log.error(f"Databases annotations: {databases_folders} NOT found")
 4562        log.debug("Databases annotations: " + str(databases_folders))
 4563
 4564        # Config - Exomiser
 4565        exomiser_bin_command = get_bin_command(
 4566            bin="exomiser-cli*.jar",
 4567            tool="exomiser",
 4568            bin_type="jar",
 4569            config=config,
 4570            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4571        )
 4572        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4573        if not exomiser_bin_command:
 4574            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4575            log.error(msg_err)
 4576            raise ValueError(msg_err)
 4577
 4578        # Param
 4579        param = self.get_param()
 4580        log.debug("Param: " + str(param))
 4581
 4582        # Param - Exomiser
 4583        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4584        log.debug(f"Param Exomiser: {param_exomiser}")
 4585
 4586        # Param - Assembly
 4587        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4588        log.debug("Assembly: " + str(assembly))
 4589
 4590        # Data
 4591        table_variants = self.get_table_variants()
 4592
 4593        # Check if not empty
 4594        log.debug("Check if not empty")
 4595        sql_query_chromosomes = (
 4596            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4597        )
 4598        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4599            log.info(f"VCF empty")
 4600            return False
 4601
 4602        # VCF header
 4603        vcf_reader = self.get_header()
 4604        log.debug("Initial header: " + str(vcf_reader.infos))
 4605
 4606        # Samples
 4607        samples = self.get_header_sample_list()
 4608        if not samples:
 4609            log.error("No Samples in VCF")
 4610            return False
 4611        log.debug(f"Samples: {samples}")
 4612
 4613        # Memory limit
 4614        memory_limit = self.get_memory("8G")
 4615        log.debug(f"memory_limit: {memory_limit}")
 4616
 4617        # Exomiser java options
 4618        exomiser_java_options = (
 4619            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4620        )
 4621        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4622
 4623        # Download Exomiser (if not exists)
 4624        exomiser_release = param_exomiser.get("release", None)
 4625        exomiser_application_properties = param_exomiser.get(
 4626            "exomiser_application_properties", None
 4627        )
 4628        databases_download_exomiser(
 4629            assemblies=[assembly],
 4630            exomiser_folder=databases_folders,
 4631            exomiser_release=exomiser_release,
 4632            exomiser_phenotype_release=exomiser_release,
 4633            exomiser_application_properties=exomiser_application_properties,
 4634        )
 4635
 4636        # Force annotation
 4637        force_update_annotation = True
 4638
 4639        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4640            log.debug("Start annotation Exomiser")
 4641
 4642            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4643
 4644                # tmp_dir = "/tmp/exomiser"
 4645
 4646                ### ANALYSIS ###
 4647                ################
 4648
 4649                # Create analysis.json through analysis dict
 4650                # either analysis in param or by default
 4651                # depending on preset (exome/genome)
 4652
 4653                # Init analysis dict
 4654                param_exomiser_analysis_dict = {}
 4655
 4656                # analysis from param
 4657                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4658                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4659
 4660                # If analysis in param -> load analysis json
 4661                if param_exomiser_analysis:
 4662
 4663                    # If param analysis is a file and exists
 4664                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4665                        param_exomiser_analysis
 4666                    ):
 4667                        # Load analysis file into analysis dict (either yaml or json)
 4668                        with open(param_exomiser_analysis) as json_file:
 4669                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4670
 4671                    # If param analysis is a dict
 4672                    elif isinstance(param_exomiser_analysis, dict):
 4673                        # Load analysis dict into analysis dict (either yaml or json)
 4674                        param_exomiser_analysis_dict = param_exomiser_analysis
 4675
 4676                    # Error analysis type
 4677                    else:
 4678                        log.error(f"Analysis type unknown. Check param file.")
 4679                        raise ValueError(f"Analysis type unknown. Check param file.")
 4680
 4681                # Case no input analysis config file/dict
 4682                # Use preset (exome/genome) to open default config file
 4683                if not param_exomiser_analysis_dict:
 4684
 4685                    # default preset
 4686                    default_preset = "exome"
 4687
 4688                    # Get param preset or default preset
 4689                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4690
 4691                    # Try to find if preset is a file
 4692                    if os.path.exists(param_exomiser_preset):
 4693                        # Preset file is provided in full path
 4694                        param_exomiser_analysis_default_config_file = (
 4695                            param_exomiser_preset
 4696                        )
 4697                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4698                    #     # Preset file is provided in full path
 4699                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4700                    elif os.path.exists(
 4701                        os.path.join(folder_config, param_exomiser_preset)
 4702                    ):
 4703                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4704                        param_exomiser_analysis_default_config_file = os.path.join(
 4705                            folder_config, param_exomiser_preset
 4706                        )
 4707                    else:
 4708                        # Construct preset file
 4709                        param_exomiser_analysis_default_config_file = os.path.join(
 4710                            folder_config,
 4711                            f"preset-{param_exomiser_preset}-analysis.json",
 4712                        )
 4713
 4714                    # If preset file exists
 4715                    param_exomiser_analysis_default_config_file = full_path(
 4716                        param_exomiser_analysis_default_config_file
 4717                    )
 4718                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4719                        # Load preset file into analysis dict (either yaml or json)
 4720                        with open(
 4721                            param_exomiser_analysis_default_config_file
 4722                        ) as json_file:
 4723                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4724                                json_file
 4725                            )
 4726
 4727                    # Error preset file
 4728                    else:
 4729                        log.error(
 4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4731                        )
 4732                        raise ValueError(
 4733                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4734                        )
 4735
 4736                # If no analysis dict created
 4737                if not param_exomiser_analysis_dict:
 4738                    log.error(f"No analysis config")
 4739                    raise ValueError(f"No analysis config")
 4740
 4741                # Log
 4742                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4743
 4744                ### PHENOPACKET ###
 4745                ###################
 4746
 4747                # If no PhenoPacket in analysis dict -> check in param
 4748                if "phenopacket" not in param_exomiser_analysis_dict:
 4749
 4750                    # If PhenoPacket in param -> load analysis json
 4751                    if param_exomiser.get("phenopacket", None):
 4752
 4753                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4754                        param_exomiser_phenopacket = full_path(
 4755                            param_exomiser_phenopacket
 4756                        )
 4757
 4758                        # If param phenopacket is a file and exists
 4759                        if isinstance(
 4760                            param_exomiser_phenopacket, str
 4761                        ) and os.path.exists(param_exomiser_phenopacket):
 4762                            # Load phenopacket file into analysis dict (either yaml or json)
 4763                            with open(param_exomiser_phenopacket) as json_file:
 4764                                param_exomiser_analysis_dict["phenopacket"] = (
 4765                                    yaml.safe_load(json_file)
 4766                                )
 4767
 4768                        # If param phenopacket is a dict
 4769                        elif isinstance(param_exomiser_phenopacket, dict):
 4770                            # Load phenopacket dict into analysis dict (either yaml or json)
 4771                            param_exomiser_analysis_dict["phenopacket"] = (
 4772                                param_exomiser_phenopacket
 4773                            )
 4774
 4775                        # Error phenopacket type
 4776                        else:
 4777                            log.error(f"Phenopacket type unknown. Check param file.")
 4778                            raise ValueError(
 4779                                f"Phenopacket type unknown. Check param file."
 4780                            )
 4781
 4782                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4783                if "phenopacket" not in param_exomiser_analysis_dict:
 4784
 4785                    # Init PhenoPacket
 4786                    param_exomiser_analysis_dict["phenopacket"] = {
 4787                        "id": "analysis",
 4788                        "proband": {},
 4789                    }
 4790
 4791                    ### Add subject ###
 4792
 4793                    # If subject exists
 4794                    param_exomiser_subject = param_exomiser.get("subject", {})
 4795
 4796                    # If subject not exists -> found sample ID
 4797                    if not param_exomiser_subject:
 4798
 4799                        # Found sample ID in param
 4800                        sample = param_exomiser.get("sample", None)
 4801
 4802                        # Find sample ID (first sample)
 4803                        if not sample:
 4804                            sample_list = self.get_header_sample_list()
 4805                            if len(sample_list) > 0:
 4806                                sample = sample_list[0]
 4807                            else:
 4808                                log.error(f"No sample found")
 4809                                raise ValueError(f"No sample found")
 4810
 4811                        # Create subject
 4812                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4813
 4814                    # Add to dict
 4815                    param_exomiser_analysis_dict["phenopacket"][
 4816                        "subject"
 4817                    ] = param_exomiser_subject
 4818
 4819                    ### Add "phenotypicFeatures" ###
 4820
 4821                    # If phenotypicFeatures exists
 4822                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4823                        "phenotypicFeatures", []
 4824                    )
 4825
 4826                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4827                    if not param_exomiser_phenotypicfeatures:
 4828
 4829                        # Found HPO in param
 4830                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4831
 4832                        # Split HPO if list in string format separated by comma
 4833                        if isinstance(param_exomiser_hpo, str):
 4834                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4835
 4836                        # Create HPO list
 4837                        for hpo in param_exomiser_hpo:
 4838                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4839                            param_exomiser_phenotypicfeatures.append(
 4840                                {
 4841                                    "type": {
 4842                                        "id": f"HP:{hpo_clean}",
 4843                                        "label": f"HP:{hpo_clean}",
 4844                                    }
 4845                                }
 4846                            )
 4847
 4848                    # Add to dict
 4849                    param_exomiser_analysis_dict["phenopacket"][
 4850                        "phenotypicFeatures"
 4851                    ] = param_exomiser_phenotypicfeatures
 4852
 4853                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4854                    if not param_exomiser_phenotypicfeatures:
 4855                        for step in param_exomiser_analysis_dict.get(
 4856                            "analysis", {}
 4857                        ).get("steps", []):
 4858                            if "hiPhivePrioritiser" in step:
 4859                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4860                                    "steps", []
 4861                                ).remove(step)
 4862
 4863                ### Add Input File ###
 4864
 4865                # Initial file name and htsFiles
 4866                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4867                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4868                    {
 4869                        "uri": tmp_vcf_name,
 4870                        "htsFormat": "VCF",
 4871                        "genomeAssembly": assembly,
 4872                    }
 4873                ]
 4874
 4875                ### Add metaData ###
 4876
 4877                # If metaData not in analysis dict
 4878                if "metaData" not in param_exomiser_analysis_dict:
 4879                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4880                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4881                        "createdBy": "howard",
 4882                        "phenopacketSchemaVersion": 1,
 4883                    }
 4884
 4885                ### OutputOptions ###
 4886
 4887                # Init output result folder
 4888                output_results = os.path.join(tmp_dir, "results")
 4889
 4890                # If no outputOptions in analysis dict
 4891                if "outputOptions" not in param_exomiser_analysis_dict:
 4892
 4893                    # default output formats
 4894                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4895
 4896                    # Get outputOptions in param
 4897                    output_options = param_exomiser.get("outputOptions", None)
 4898
 4899                    # If no output_options in param -> check
 4900                    if not output_options:
 4901                        output_options = {
 4902                            "outputContributingVariantsOnly": False,
 4903                            "numGenes": 0,
 4904                            "outputFormats": defaut_output_formats,
 4905                        }
 4906
 4907                    # Replace outputDirectory in output options
 4908                    output_options["outputDirectory"] = output_results
 4909                    output_options["outputFileName"] = "howard"
 4910
 4911                    # Add outputOptions in analysis dict
 4912                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4913
 4914                else:
 4915
 4916                    # Replace output_results and output format (if exists in param)
 4917                    param_exomiser_analysis_dict["outputOptions"][
 4918                        "outputDirectory"
 4919                    ] = output_results
 4920                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4921                        list(
 4922                            set(
 4923                                param_exomiser_analysis_dict.get(
 4924                                    "outputOptions", {}
 4925                                ).get("outputFormats", [])
 4926                                + ["TSV_VARIANT", "VCF"]
 4927                            )
 4928                        )
 4929                    )
 4930
 4931                # log
 4932                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4933
 4934                ### ANALYSIS FILE ###
 4935                #####################
 4936
 4937                ### Full JSON analysis config file ###
 4938
 4939                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4940                with open(exomiser_analysis, "w") as fp:
 4941                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4942
 4943                ### SPLIT analysis and sample config files
 4944
 4945                # Split (copied) analysis dict
 4946                param_exomiser_analysis_dict_for_split = (
 4947                    param_exomiser_analysis_dict.copy()
 4948                )
 4949
 4950                # Phenopacket JSON file
 4951                exomiser_analysis_phenopacket = os.path.join(
 4952                    tmp_dir, "analysis_phenopacket.json"
 4953                )
 4954                with open(exomiser_analysis_phenopacket, "w") as fp:
 4955                    json.dump(
 4956                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4957                        fp,
 4958                        indent=4,
 4959                    )
 4960
 4961                # Analysis JSON file without Phenopacket parameters
 4962                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4963                exomiser_analysis_analysis = os.path.join(
 4964                    tmp_dir, "analysis_analysis.json"
 4965                )
 4966                with open(exomiser_analysis_analysis, "w") as fp:
 4967                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4968
 4969                ### INITIAL VCF file ###
 4970                ########################
 4971
 4972                ### Create list of samples to use and include into initial VCF file ####
 4973
 4974                # Subject (main sample)
 4975                # Get sample ID in analysis dict
 4976                sample_subject = (
 4977                    param_exomiser_analysis_dict.get("phenopacket", {})
 4978                    .get("subject", {})
 4979                    .get("id", None)
 4980                )
 4981                sample_proband = (
 4982                    param_exomiser_analysis_dict.get("phenopacket", {})
 4983                    .get("proband", {})
 4984                    .get("subject", {})
 4985                    .get("id", None)
 4986                )
 4987                sample = []
 4988                if sample_subject:
 4989                    sample.append(sample_subject)
 4990                if sample_proband:
 4991                    sample.append(sample_proband)
 4992
 4993                # Get sample ID within Pedigree
 4994                pedigree_persons_list = (
 4995                    param_exomiser_analysis_dict.get("phenopacket", {})
 4996                    .get("pedigree", {})
 4997                    .get("persons", {})
 4998                )
 4999
 5000                # Create list with all sample ID in pedigree (if exists)
 5001                pedigree_persons = []
 5002                for person in pedigree_persons_list:
 5003                    pedigree_persons.append(person.get("individualId"))
 5004
 5005                # Concat subject sample ID and samples ID in pedigree samples
 5006                samples = list(set(sample + pedigree_persons))
 5007
 5008                # Check if sample list is not empty
 5009                if not samples:
 5010                    log.error(f"No samples found")
 5011                    raise ValueError(f"No samples found")
 5012
 5013                # Create VCF with sample (either sample in param or first one by default)
 5014                # Export VCF file
 5015                self.export_variant_vcf(
 5016                    vcf_file=tmp_vcf_name,
 5017                    remove_info=True,
 5018                    add_samples=True,
 5019                    list_samples=samples,
 5020                    index=False,
 5021                )
 5022
 5023                ### Execute Exomiser ###
 5024                ########################
 5025
 5026                # Init command
 5027                exomiser_command = ""
 5028
 5029                # Command exomiser options
 5030                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5031
 5032                # Release
 5033                exomiser_release = param_exomiser.get("release", None)
 5034                if exomiser_release:
 5035                    # phenotype data version
 5036                    exomiser_options += (
 5037                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5038                    )
 5039                    # data version
 5040                    exomiser_options += (
 5041                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5042                    )
 5043                    # variant white list
 5044                    variant_white_list_file = (
 5045                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5046                    )
 5047                    if os.path.exists(
 5048                        os.path.join(
 5049                            databases_folders, assembly, variant_white_list_file
 5050                        )
 5051                    ):
 5052                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5053
 5054                # transcript_source
 5055                transcript_source = param_exomiser.get(
 5056                    "transcript_source", None
 5057                )  # ucsc, refseq, ensembl
 5058                if transcript_source:
 5059                    exomiser_options += (
 5060                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5061                    )
 5062
 5063                # If analysis contain proband param
 5064                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5065                    "proband", {}
 5066                ):
 5067                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5068
 5069                # If no proband (usually uniq sample)
 5070                else:
 5071                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5072
 5073                # Log
 5074                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5075
 5076                # Run command
 5077                result = subprocess.call(
 5078                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5079                )
 5080                if result:
 5081                    log.error("Exomiser command failed")
 5082                    raise ValueError("Exomiser command failed")
 5083
 5084                ### RESULTS ###
 5085                ###############
 5086
 5087                ### Annotate with TSV fields ###
 5088
 5089                # Init result tsv file
 5090                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5091
 5092                # Init result tsv file
 5093                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5094
 5095                # Parse TSV file and explode columns in INFO field
 5096                if exomiser_to_info and os.path.exists(output_results_tsv):
 5097
 5098                    # Log
 5099                    log.debug("Exomiser columns to VCF INFO field")
 5100
 5101                    # Retrieve columns and types
 5102                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5103                    output_results_tsv_df = self.get_query_to_df(query)
 5104                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5105
 5106                    # Init concat fields for update
 5107                    sql_query_update_concat_fields = []
 5108
 5109                    # Fields to avoid
 5110                    fields_to_avoid = [
 5111                        "CONTIG",
 5112                        "START",
 5113                        "END",
 5114                        "REF",
 5115                        "ALT",
 5116                        "QUAL",
 5117                        "FILTER",
 5118                        "GENOTYPE",
 5119                    ]
 5120
 5121                    # List all columns to add into header
 5122                    for header_column in output_results_tsv_columns:
 5123
 5124                        # If header column is enable
 5125                        if header_column not in fields_to_avoid:
 5126
 5127                            # Header info type
 5128                            header_info_type = "String"
 5129                            header_column_df = output_results_tsv_df[header_column]
 5130                            header_column_df_dtype = header_column_df.dtype
 5131                            if header_column_df_dtype == object:
 5132                                if (
 5133                                    pd.to_numeric(header_column_df, errors="coerce")
 5134                                    .notnull()
 5135                                    .all()
 5136                                ):
 5137                                    header_info_type = "Float"
 5138                            else:
 5139                                header_info_type = "Integer"
 5140
 5141                            # Header info
 5142                            characters_to_validate = ["-"]
 5143                            pattern = "[" + "".join(characters_to_validate) + "]"
 5144                            header_info_name = re.sub(
 5145                                pattern,
 5146                                "_",
 5147                                f"Exomiser_{header_column}".replace("#", ""),
 5148                            )
 5149                            header_info_number = "."
 5150                            header_info_description = (
 5151                                f"Exomiser {header_column} annotation"
 5152                            )
 5153                            header_info_source = "Exomiser"
 5154                            header_info_version = "unknown"
 5155                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5156                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5157                                header_info_name,
 5158                                header_info_number,
 5159                                header_info_type,
 5160                                header_info_description,
 5161                                header_info_source,
 5162                                header_info_version,
 5163                                header_info_code,
 5164                            )
 5165
 5166                            # Add field to add for update to concat fields
 5167                            sql_query_update_concat_fields.append(
 5168                                f"""
 5169                                CASE
 5170                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5171                                    THEN concat(
 5172                                        '{header_info_name}=',
 5173                                        table_parquet."{header_column}",
 5174                                        ';'
 5175                                        )
 5176
 5177                                    ELSE ''
 5178                                END
 5179                            """
 5180                            )
 5181
 5182                    # Update query
 5183                    sql_query_update = f"""
 5184                        UPDATE {table_variants} as table_variants
 5185                            SET INFO = concat(
 5186                                            CASE
 5187                                                WHEN INFO NOT IN ('', '.')
 5188                                                THEN INFO
 5189                                                ELSE ''
 5190                                            END,
 5191                                            CASE
 5192                                                WHEN table_variants.INFO NOT IN ('','.')
 5193                                                THEN ';'
 5194                                                ELSE ''
 5195                                            END,
 5196                                            (
 5197                                            SELECT 
 5198                                                concat(
 5199                                                    {",".join(sql_query_update_concat_fields)}
 5200                                                )
 5201                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5202                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5203                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5204                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5205                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5206                                            )
 5207                                        )
 5208                            ;
 5209                        """
 5210
 5211                    # Update
 5212                    self.conn.execute(sql_query_update)
 5213
 5214                ### Annotate with VCF INFO field ###
 5215
 5216                # Init result VCF file
 5217                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5218
 5219                # If VCF exists
 5220                if os.path.exists(output_results_vcf):
 5221
 5222                    # Log
 5223                    log.debug("Exomiser result VCF update variants")
 5224
 5225                    # Find Exomiser INFO field annotation in header
 5226                    with gzip.open(output_results_vcf, "rt") as f:
 5227                        header_list = self.read_vcf_header(f)
 5228                    exomiser_vcf_header = vcf.Reader(
 5229                        io.StringIO("\n".join(header_list))
 5230                    )
 5231
 5232                    # Add annotation INFO field to header
 5233                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5234
 5235                    # Update variants with VCF
 5236                    self.update_from_vcf(output_results_vcf)
 5237
 5238        return True
 5239
 5240    def annotation_snpeff(self, threads: int = None) -> None:
 5241        """
 5242        This function annotate with snpEff
 5243
 5244        :param threads: The number of threads to use
 5245        :return: the value of the variable "return_value".
 5246        """
 5247
 5248        # DEBUG
 5249        log.debug("Start annotation with snpeff databases")
 5250
 5251        # Threads
 5252        if not threads:
 5253            threads = self.get_threads()
 5254        log.debug("Threads: " + str(threads))
 5255
 5256        # DEBUG
 5257        delete_tmp = True
 5258        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5259            delete_tmp = False
 5260            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5261
 5262        # Config
 5263        config = self.get_config()
 5264        log.debug("Config: " + str(config))
 5265
 5266        # Config - Folders - Databases
 5267        databases_folders = (
 5268            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5269        )
 5270        log.debug("Databases annotations: " + str(databases_folders))
 5271
 5272        # Config - snpEff bin command
 5273        snpeff_bin_command = get_bin_command(
 5274            bin="snpEff.jar",
 5275            tool="snpeff",
 5276            bin_type="jar",
 5277            config=config,
 5278            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5279        )
 5280        if not snpeff_bin_command:
 5281            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5282            log.error(msg_err)
 5283            raise ValueError(msg_err)
 5284
 5285        # Config - snpEff databases
 5286        snpeff_databases = (
 5287            config.get("folders", {})
 5288            .get("databases", {})
 5289            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5290        )
 5291        snpeff_databases = full_path(snpeff_databases)
 5292        if snpeff_databases is not None and snpeff_databases != "":
 5293            log.debug(f"Create snpEff databases folder")
 5294            if not os.path.exists(snpeff_databases):
 5295                os.makedirs(snpeff_databases)
 5296
 5297        # Param
 5298        param = self.get_param()
 5299        log.debug("Param: " + str(param))
 5300
 5301        # Param
 5302        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5303        log.debug("Options: " + str(options))
 5304
 5305        # Param - Assembly
 5306        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5307
 5308        # Param - Options
 5309        snpeff_options = (
 5310            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5311        )
 5312        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5313        snpeff_csvstats = (
 5314            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5315        )
 5316        if snpeff_stats:
 5317            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5318            snpeff_stats = full_path(snpeff_stats)
 5319            snpeff_options += f" -stats {snpeff_stats}"
 5320        if snpeff_csvstats:
 5321            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5322            snpeff_csvstats = full_path(snpeff_csvstats)
 5323            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5324
 5325        # Data
 5326        table_variants = self.get_table_variants()
 5327
 5328        # Check if not empty
 5329        log.debug("Check if not empty")
 5330        sql_query_chromosomes = (
 5331            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5332        )
 5333        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5334        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5335            log.info(f"VCF empty")
 5336            return
 5337
 5338        # Export in VCF
 5339        log.debug("Create initial file to annotate")
 5340        tmp_vcf = NamedTemporaryFile(
 5341            prefix=self.get_prefix(),
 5342            dir=self.get_tmp_dir(),
 5343            suffix=".vcf.gz",
 5344            delete=True,
 5345        )
 5346        tmp_vcf_name = tmp_vcf.name
 5347
 5348        # VCF header
 5349        vcf_reader = self.get_header()
 5350        log.debug("Initial header: " + str(vcf_reader.infos))
 5351
 5352        # Existing annotations
 5353        for vcf_annotation in self.get_header().infos:
 5354
 5355            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5356            log.debug(
 5357                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5358            )
 5359
 5360        # Memory limit
 5361        # if config.get("memory", None):
 5362        #     memory_limit = config.get("memory", "8G")
 5363        # else:
 5364        #     memory_limit = "8G"
 5365        memory_limit = self.get_memory("8G")
 5366        log.debug(f"memory_limit: {memory_limit}")
 5367
 5368        # snpEff java options
 5369        snpeff_java_options = (
 5370            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5371        )
 5372        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5373
 5374        force_update_annotation = True
 5375
 5376        if "ANN" not in self.get_header().infos or force_update_annotation:
 5377
 5378            # Check snpEff database
 5379            log.debug(f"Check snpEff databases {[assembly]}")
 5380            databases_download_snpeff(
 5381                folder=snpeff_databases, assemblies=[assembly], config=config
 5382            )
 5383
 5384            # Export VCF file
 5385            self.export_variant_vcf(
 5386                vcf_file=tmp_vcf_name,
 5387                remove_info=True,
 5388                add_samples=False,
 5389                index=True,
 5390            )
 5391
 5392            # Tmp file
 5393            err_files = []
 5394            tmp_annotate_vcf = NamedTemporaryFile(
 5395                prefix=self.get_prefix(),
 5396                dir=self.get_tmp_dir(),
 5397                suffix=".vcf",
 5398                delete=False,
 5399            )
 5400            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5401            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5402            err_files.append(tmp_annotate_vcf_name_err)
 5403
 5404            # Command
 5405            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5406            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5407            run_parallel_commands([snpeff_command], 1)
 5408
 5409            # Error messages
 5410            log.info(f"Error/Warning messages:")
 5411            error_message_command_all = []
 5412            error_message_command_warning = []
 5413            error_message_command_err = []
 5414            for err_file in err_files:
 5415                with open(err_file, "r") as f:
 5416                    for line in f:
 5417                        message = line.strip()
 5418                        error_message_command_all.append(message)
 5419                        if line.startswith("[W::"):
 5420                            error_message_command_warning.append(message)
 5421                        if line.startswith("[E::"):
 5422                            error_message_command_err.append(f"{err_file}: " + message)
 5423            # log info
 5424            for message in list(
 5425                set(error_message_command_err + error_message_command_warning)
 5426            ):
 5427                log.info(f"   {message}")
 5428            # debug info
 5429            for message in list(set(error_message_command_all)):
 5430                log.debug(f"   {message}")
 5431            # failed
 5432            if len(error_message_command_err):
 5433                log.error("Annotation failed: Error in commands")
 5434                raise ValueError("Annotation failed: Error in commands")
 5435
 5436            # Find annotation in header
 5437            with open(tmp_annotate_vcf_name, "rt") as f:
 5438                header_list = self.read_vcf_header(f)
 5439            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5440
 5441            for ann in annovar_vcf_header.infos:
 5442                if ann not in self.get_header().infos:
 5443                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5444
 5445            # Update variants
 5446            log.info(f"Annotation - Updating...")
 5447            self.update_from_vcf(tmp_annotate_vcf_name)
 5448
 5449        else:
 5450            if "ANN" in self.get_header().infos:
 5451                log.debug(f"Existing snpEff annotations in VCF")
 5452            if force_update_annotation:
 5453                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5454
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate variants with Annovar databases.

        Exports the current variants to a temporary VCF, runs Annovar
        (table_annovar.pl) once per configured database, post-processes each
        result through a bcftools/sed/awk pipeline (cleaning Annovar artifacts
        and renaming INFO fields), merges all annotated files with bcftools,
        then updates the in-memory VCF header and the variants table.

        :param threads: number of threads to use (defaults to self.get_threads())
        :return: None
        :raises ValueError: if the annovar/bcftools binaries cannot be
            resolved, the databases folder is misconfigured, or a command
            reports errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files collected for final cleanup / error parsing
        tmp_files = []
        err_files = []

        # Keep tmp files/folders when verbosity is debug, to allow inspection
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl via perl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for view/annotate/merge steps)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder; a list is collapsed to its first
        # entry, and the folder is created if missing
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of database name -> fields to keep
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Always re-annotate, even if the field already exists in the header
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." here vs remove_info=True in the
            # snpEff path — presumably "." replaces INFO with the VCF missing
            # value; TODO confirm against export_variant_vcf.
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools
            # --rename-annots below)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset here each iteration, so
                # only the current database's stderr is parsed below.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "INFO/old new" line per field, for
                    # bcftools --rename-annots)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: parse stderr files for warning/error prefixes
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            # Merge all per-database annotated VCFs back with the original
            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and merge any
                # new INFO fields into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): runs unconditionally (if True) — delete_tmp
            # computed above is not consulted here; TODO confirm intent.
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
 5844
 5845    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or several parquet/duckdb annotation
        databases, updating the INFO column in place.

        For each database listed under param ``annotation.parquet.annotations``
        (the special key ``"ALL"`` triggers a scan of available databases via
        ``self.scan_databases``), the selected fields are extracted either from
        the database INFO column or from dedicated columns, and concatenated
        into the variants INFO field through per-chromosome SQL UPDATE queries.
        The in-memory VCF header (``self.get_header()``) is extended with the
        corresponding INFO definitions.

        :param threads: number of threads to use for the annotation
            (defaults to ``self.get_threads()``)
        :return: None; the variants table and the VCF header are modified in place
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # delete_tmp is computed and logged here but not otherwise used in this
        # method (this method creates no temporary files itself).
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Search folders: union of the generic "annotations" folders and the
        # parquet-specific folders (deduplicated via set).
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping of database identifier -> {field: new_name or None}.
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        # Param takes precedence over config; falls back to DEFAULT_ASSEMBLY.
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # annotations_update: re-annotate fields already present (old values
        # are first stripped from INFO by the "remove" queries built below).
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # annotations_append: only fill fields whose current value is empty/'.'.
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        # Mutated below: new INFO definitions are added for each annotated field.
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        # Total variant count, used only for the final summary log message.
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never appended to anywhere in this method, so the
        # cleanup loop at the end is currently a no-op — confirm intent.
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan for available databases (optionally
            # filtered by formats/releases) and add each as {"INFO": None},
            # i.e. annotate with all of its fields.
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" itself is a directive, not a database — already expanded.
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                # None/empty means "all fields" (normalized to {"INFO": None}).
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                # Resolves the database file and its header across the folders.
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion
                    # Some database formats require an ATTACH statement before
                    # they can be referenced in queries.
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    # SQL fragment usable in a FROM clause to read the database.
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Extra database columns not declared in the header get a
                    # synthetic String INFO definition so they can be annotated.
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    # "ALL"/"INFO" expands to every field declared in the
                    # database header (identity mapping: no renaming).
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    # Maps requested field names to actual database columns
                    # (handling the "INFO/" prefix convention).
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        # Fields without a dedicated column fall back to INFO.
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        # Annotate when the field exists in the database header
                        # AND it is new to the VCF header (or update/append forced).
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                # Strip the existing 'field=value' entry so the
                                # fresh value can be appended by the main query.
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            # First field gets no leading ';' separator.
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            # Copy header metadata from the database, with
                            # defaults for any missing attribute.
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # Append mode: extra predicate restricting updates
                            # to rows where the field is currently empty/'.'.
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            # Value is extracted from the database INFO string
                            # with a regex on 'field=value'.
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column
                            # Value comes from a dedicated column; ';' inside
                            # values is replaced by ',' to keep INFO parseable.
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                # For regions databases, overlapping region
                                # values are aggregated per position (see the
                                # regions FROM clause below).
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    # Fast path: when every field was selected and the database
                    # exposes a full INFO column, copy that column wholesale
                    # instead of per-field CASE expressions. Not valid in
                    # append mode (per-field emptiness checks needed) nor for
                    # regions databases.
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        # One UPDATE query is issued per chromosome (min/max POS
                        # are collected but only count is used below).
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # NOTE: alias, not a copy — the per-chromosome UPDATE
                        # queries are added into the same dict, so the "remove"
                        # queries (update mode) execute first in the loop below.
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Subquery left-joins variants to overlapping
                            # regions (variant span [POS, POS+len(REF)-1] vs
                            # region (START, END]) and aggregates region values
                            # per POS, then matches back on (#CHROM, POS).
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Exact match on (#CHROM, POS, REF, ALT).
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # Appends the new annotations to INFO, inserting a
                            # ';' separator only when INFO is non-empty and the
                            # new annotation string is non-empty.
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # The generated concat of many CASE expressions can
                        # exceed DuckDB's default expression depth limit.
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # Assumes DuckDB reports the number of updated rows
                            # in a 'Count' column of the UPDATE result.
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        # See NOTE(review) above: added_columns is never populated here.
        for added_column in added_columns:
            self.drop_column(column=added_column)
 6429
 6430    def annotation_splice(self, threads: int = None) -> None:
 6431        """
 6432        This function annotate with snpEff
 6433
 6434        :param threads: The number of threads to use
 6435        :return: the value of the variable "return_value".
 6436        """
 6437
 6438        # DEBUG
 6439        log.debug("Start annotation with splice tools")
 6440
 6441        # Threads
 6442        if not threads:
 6443            threads = self.get_threads()
 6444        log.debug("Threads: " + str(threads))
 6445
 6446        # DEBUG
 6447        delete_tmp = True
 6448        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6449            delete_tmp = False
 6450            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6451
 6452        # Config
 6453        config = self.get_config()
 6454        log.debug("Config: " + str(config))
 6455        splice_config = config.get("tools", {}).get("splice", {})
 6456        if not splice_config:
 6457            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6458            msg_err = "No Splice tool config"
 6459            raise ValueError(msg_err)
 6460        log.debug(f"splice_config: {splice_config}")
 6461
 6462        # Config - Folders - Databases
 6463        databases_folders = (
 6464            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6465        )
 6466        log.debug("Databases annotations: " + str(databases_folders))
 6467
 6468        # Splice docker image
 6469        splice_docker_image = splice_config.get("docker").get("image")
 6470
 6471        # Pull splice image if it's not already there
 6472        if not check_docker_image_exists(splice_docker_image):
 6473            log.warning(
 6474                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6475            )
 6476            try:
 6477                command(f"docker pull {splice_config.get('docker').get('image')}")
 6478            except subprocess.CalledProcessError:
 6479                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6480                log.error(msg_err)
 6481                raise ValueError(msg_err)
 6482
 6483        # Config - splice databases
 6484        splice_databases = (
 6485            config.get("folders", {})
 6486            .get("databases", {})
 6487            .get("splice", DEFAULT_SPLICE_FOLDER)
 6488        )
 6489        splice_databases = full_path(splice_databases)
 6490
 6491        # Param
 6492        param = self.get_param()
 6493        log.debug("Param: " + str(param))
 6494
 6495        # Param
 6496        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6497        log.debug("Options: " + str(options))
 6498
 6499        # Data
 6500        table_variants = self.get_table_variants()
 6501
 6502        # Check if not empty
 6503        log.debug("Check if not empty")
 6504        sql_query_chromosomes = (
 6505            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6506        )
 6507        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6508            log.info("VCF empty")
 6509            return None
 6510
 6511        # Export in VCF
 6512        log.debug("Create initial file to annotate")
 6513
 6514        # Create output folder / work folder
 6515        if options.get("output_folder", ""):
 6516            output_folder = options.get("output_folder", "")
 6517            if not os.path.exists(output_folder):
 6518                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6519        else:
 6520            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6521            if not os.path.exists(output_folder):
 6522                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6523
 6524        if options.get("workdir", ""):
 6525            workdir = options.get("workdir", "")
 6526        else:
 6527            workdir = "/work"
 6528
 6529        # Create tmp VCF file
 6530        tmp_vcf = NamedTemporaryFile(
 6531            prefix=self.get_prefix(),
 6532            dir=output_folder,
 6533            suffix=".vcf",
 6534            delete=False,
 6535        )
 6536        tmp_vcf_name = tmp_vcf.name
 6537
 6538        # VCF header
 6539        header = self.get_header()
 6540
 6541        # Existing annotations
 6542        for vcf_annotation in self.get_header().infos:
 6543
 6544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6545            log.debug(
 6546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6547            )
 6548
 6549        # Memory limit
 6550        if config.get("memory", None):
 6551            memory_limit = config.get("memory", "8G").upper()
 6552            # upper()
 6553        else:
 6554            memory_limit = "8G"
 6555        log.debug(f"memory_limit: {memory_limit}")
 6556
 6557        # Check number of variants to annotate
 6558        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6559        where_clause_regex_spip = r"SPiP_\w+"
 6560        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6561        df_list_of_variants_to_annotate = self.get_query_to_df(
 6562            query=f""" SELECT * FROM variants {where_clause} """
 6563        )
 6564        if len(df_list_of_variants_to_annotate) == 0:
 6565            log.warning(
 6566                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6567            )
 6568            return None
 6569        else:
 6570            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6571
 6572        # Export VCF file
 6573        self.export_variant_vcf(
 6574            vcf_file=tmp_vcf_name,
 6575            remove_info=True,
 6576            add_samples=True,
 6577            index=False,
 6578            where_clause=where_clause,
 6579        )
 6580        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6581        if any(value for value in splice_config.values() if value is None):
 6582            log.warning("At least one splice config parameter is empty")
 6583            # exit annotation_splice
 6584            return None
 6585
 6586        # Params in splice nf
 6587        def check_values(dico: dict):
 6588            """
 6589            Ensure parameters for NF splice pipeline
 6590            """
 6591            for key, val in dico.items():
 6592                if key == "genome":
 6593                    if any(
 6594                        assemb in options.get("genome", {})
 6595                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6596                    ):
 6597                        yield f"--{key} hg19"
 6598                    elif any(
 6599                        assemb in options.get("genome", {})
 6600                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6601                    ):
 6602                        yield f"--{key} hg38"
 6603                elif (
 6604                    (isinstance(val, str) and val)
 6605                    or isinstance(val, int)
 6606                    or isinstance(val, bool)
 6607                ):
 6608                    yield f"--{key} {val}"
 6609
 6610        # Genome
 6611        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6612        options["genome"] = genome
 6613        # NF params
 6614        nf_params = []
 6615        # Add options
 6616        if options:
 6617            log.debug(options)
 6618            nf_params = list(check_values(options))
 6619            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6620        else:
 6621            log.debug("No NF params provided")
 6622        # Add threads
 6623        if "threads" not in options.keys():
 6624            nf_params.append(f"--threads {threads}")
 6625        # Genome path
 6626        genome_path = find_genome(
 6627            config.get("folders", {})
 6628            .get("databases", {})
 6629            .get("genomes", DEFAULT_GENOME_FOLDER),
 6630            file=f"{genome}.fa",
 6631        )
 6632        # Add genome path
 6633        if not genome_path:
 6634            raise ValueError(
 6635                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6636            )
 6637        else:
 6638            log.debug(f"Genome: {genome_path}")
 6639            nf_params.append(f"--genome_path {genome_path}")
 6640
 6641        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6642            """
 6643            Setting up updated databases for SPiP and SpliceAI
 6644            """
 6645
 6646            try:
 6647
 6648                # SpliceAI assembly transcriptome
 6649                spliceai_assembly = os.path.join(
 6650                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6651                    options.get("genome"),
 6652                    "transcriptome",
 6653                )
 6654                spip_assembly = options.get("genome")
 6655
 6656                spip = find(
 6657                    f"transcriptome_{spip_assembly}.RData",
 6658                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6659                )
 6660                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6661                log.debug(f"SPiP annotations: {spip}")
 6662                log.debug(f"SpliceAI annotations: {spliceai}")
 6663                if spip and spliceai:
 6664                    return [
 6665                        f"--spip_transcriptome {spip}",
 6666                        f"--spliceai_transcriptome {spliceai}",
 6667                    ]
 6668                else:
 6669                    log.warning(
 6670                        "Can't find splice databases in configuration, use annotations file from image"
 6671                    )
 6672            except TypeError:
 6673                log.warning(
 6674                    "Can't find splice databases in configuration, use annotations file from image"
 6675                )
 6676                return []
 6677
 6678        # Add options, check if transcriptome option have already beend provided
 6679        if (
 6680            "spip_transcriptome" not in nf_params
 6681            and "spliceai_transcriptome" not in nf_params
 6682        ):
 6683            splice_reference = splice_annotations(options, config)
 6684            if splice_reference:
 6685                nf_params.extend(splice_reference)
 6686        # nf_params.append(f"--output_folder {output_folder}")
 6687        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6688        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6689        log.debug(cmd)
 6690        splice_config["docker"]["command"] = cmd
 6691
 6692        # Ensure proxy is set
 6693        proxy = [
 6694            f"-e {var}={os.getenv(var)}"
 6695            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6696            if os.getenv(var) is not None
 6697        ]
 6698        docker_cmd = get_bin_command(
 6699            tool="splice",
 6700            bin_type="docker",
 6701            config=config,
 6702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6703            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6704        )
 6705        # print(docker_cmd)
 6706        # exit()
 6707        # Docker debug
 6708        # if splice_config.get("rm_container"):
 6709        #     rm_container = "--rm"
 6710        # else:
 6711        #     rm_container = ""
 6712        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6713        log.debug(docker_cmd)
 6714        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6715        log.debug(res.stdout)
 6716        if res.stderr:
 6717            log.error(res.stderr)
 6718        res.check_returncode()
 6719        # Update variants
 6720        log.info("Annotation - Updating...")
 6721        # Test find output vcf
 6722        log.debug(
 6723            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6724        )
 6725        output_vcf = []
 6726        # Wrong folder to look in
 6727        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6728            if (
 6729                files
 6730                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6731            ):
 6732                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6733        # log.debug(os.listdir(options.get("output_folder")))
 6734        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6735        if not output_vcf:
 6736            log.debug(
 6737                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6738            )
 6739        else:
 6740            # Get new header from annotated vcf
 6741            log.debug(f"Initial header: {len(header.infos)} fields")
 6742            # Create new header with splice infos
 6743            new_vcf = Variants(input=output_vcf[0])
 6744            new_vcf_header = new_vcf.get_header().infos
 6745            for keys, infos in new_vcf_header.items():
 6746                if keys not in header.infos.keys():
 6747                    header.infos[keys] = infos
 6748            log.debug(f"New header: {len(header.infos)} fields")
 6749            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6750            self.update_from_vcf(output_vcf[0])
 6751
 6752        # Remove file
 6753        remove_if_exists(output_vcf)
 6754
 6755    ###
 6756    # Prioritization
 6757    ###
 6758
 6759    def get_config_default(self, name: str) -> dict:
 6760        """
 6761        The function `get_config_default` returns a dictionary containing default configurations for
 6762        various calculations and prioritizations.
 6763
 6764        :param name: The `get_config_default` function returns a dictionary containing default
 6765        configurations for different calculations and prioritizations. The `name` parameter is used to
 6766        specify which specific configuration to retrieve from the dictionary
 6767        :type name: str
 6768        :return: The function `get_config_default` returns a dictionary containing default configuration
 6769        settings for different calculations and prioritizations. The specific configuration settings are
 6770        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6771        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6772        returned. If there is no match, an empty dictionary is returned.
 6773        """
 6774
 6775        config_default = {
 6776            "calculations": {
 6777                "variant_chr_pos_alt_ref": {
 6778                    "type": "sql",
 6779                    "name": "variant_chr_pos_alt_ref",
 6780                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6781                    "available": False,
 6782                    "output_column_name": "variant_chr_pos_alt_ref",
 6783                    "output_column_type": "String",
 6784                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6785                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6786                    "operation_info": True,
 6787                },
 6788                "VARTYPE": {
 6789                    "type": "sql",
 6790                    "name": "VARTYPE",
 6791                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6792                    "available": True,
 6793                    "table": "variants",
 6794                    "output_column_name": "VARTYPE",
 6795                    "output_column_type": "String",
 6796                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6797                    "operation_query": """
 6798                            CASE
 6799                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6800                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6801                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6802                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6803                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6804                                ELSE 'UNDEFINED'
 6805                            END
 6806                            """,
 6807                    "info_fields": ["SVTYPE"],
 6808                    "operation_info": True,
 6809                },
 6810                "snpeff_hgvs": {
 6811                    "type": "python",
 6812                    "name": "snpeff_hgvs",
 6813                    "description": "HGVS nomenclatures from snpEff annotation",
 6814                    "available": True,
 6815                    "function_name": "calculation_extract_snpeff_hgvs",
 6816                    "function_params": ["snpeff_hgvs", "ANN"],
 6817                },
 6818                "snpeff_ann_explode": {
 6819                    "type": "python",
 6820                    "name": "snpeff_ann_explode",
 6821                    "description": "Explode snpEff annotations with uniquify values",
 6822                    "available": True,
 6823                    "function_name": "calculation_snpeff_ann_explode",
 6824                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6825                },
 6826                "snpeff_ann_explode_uniquify": {
 6827                    "type": "python",
 6828                    "name": "snpeff_ann_explode_uniquify",
 6829                    "description": "Explode snpEff annotations",
 6830                    "available": True,
 6831                    "function_name": "calculation_snpeff_ann_explode",
 6832                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6833                },
 6834                "snpeff_ann_explode_json": {
 6835                    "type": "python",
 6836                    "name": "snpeff_ann_explode_json",
 6837                    "description": "Explode snpEff annotations in JSON format",
 6838                    "available": True,
 6839                    "function_name": "calculation_snpeff_ann_explode",
 6840                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6841                },
 6842                "NOMEN": {
 6843                    "type": "python",
 6844                    "name": "NOMEN",
 6845                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6846                    "available": True,
 6847                    "function_name": "calculation_extract_nomen",
 6848                    "function_params": [],
 6849                },
 6850                "RENAME_INFO_FIELDS": {
 6851                    "type": "python",
 6852                    "name": "RENAME_INFO_FIELDS",
 6853                    "description": "Rename or remove INFO/tags",
 6854                    "available": True,
 6855                    "function_name": "calculation_rename_info_fields",
 6856                    "function_params": [],
 6857                },
 6858                "FINDBYPIPELINE": {
 6859                    "type": "python",
 6860                    "name": "FINDBYPIPELINE",
 6861                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6862                    "available": True,
 6863                    "function_name": "calculation_find_by_pipeline",
 6864                    "function_params": ["findbypipeline"],
 6865                },
 6866                "FINDBYSAMPLE": {
 6867                    "type": "python",
 6868                    "name": "FINDBYSAMPLE",
 6869                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6870                    "available": True,
 6871                    "function_name": "calculation_find_by_pipeline",
 6872                    "function_params": ["findbysample"],
 6873                },
 6874                "GENOTYPECONCORDANCE": {
 6875                    "type": "python",
 6876                    "name": "GENOTYPECONCORDANCE",
 6877                    "description": "Concordance of genotype for multi caller VCF",
 6878                    "available": True,
 6879                    "function_name": "calculation_genotype_concordance",
 6880                    "function_params": [],
 6881                },
 6882                "BARCODE": {
 6883                    "type": "python",
 6884                    "name": "BARCODE",
 6885                    "description": "BARCODE as VaRank tool",
 6886                    "available": True,
 6887                    "function_name": "calculation_barcode",
 6888                    "function_params": [],
 6889                },
 6890                "BARCODEFAMILY": {
 6891                    "type": "python",
 6892                    "name": "BARCODEFAMILY",
 6893                    "description": "BARCODEFAMILY as VaRank tool",
 6894                    "available": True,
 6895                    "function_name": "calculation_barcode_family",
 6896                    "function_params": ["BCF"],
 6897                },
 6898                "TRIO": {
 6899                    "type": "python",
 6900                    "name": "TRIO",
 6901                    "description": "Inheritance for a trio family",
 6902                    "available": True,
 6903                    "function_name": "calculation_trio",
 6904                    "function_params": [],
 6905                },
 6906                "VAF": {
 6907                    "type": "python",
 6908                    "name": "VAF",
 6909                    "description": "Variant Allele Frequency (VAF) harmonization",
 6910                    "available": True,
 6911                    "function_name": "calculation_vaf_normalization",
 6912                    "function_params": [],
 6913                },
 6914                "VAF_stats": {
 6915                    "type": "python",
 6916                    "name": "VAF_stats",
 6917                    "description": "Variant Allele Frequency (VAF) statistics",
 6918                    "available": True,
 6919                    "function_name": "calculation_genotype_stats",
 6920                    "function_params": ["VAF"],
 6921                },
 6922                "DP_stats": {
 6923                    "type": "python",
 6924                    "name": "DP_stats",
 6925                    "description": "Depth (DP) statistics",
 6926                    "available": True,
 6927                    "function_name": "calculation_genotype_stats",
 6928                    "function_params": ["DP"],
 6929                },
 6930                "variant_id": {
 6931                    "type": "python",
 6932                    "name": "variant_id",
 6933                    "description": "Variant ID generated from variant position and type",
 6934                    "available": True,
 6935                    "function_name": "calculation_variant_id",
 6936                    "function_params": [],
 6937                },
 6938                "transcripts_json": {
 6939                    "type": "python",
 6940                    "name": "transcripts_json",
 6941                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6942                    "available": True,
 6943                    "function_name": "calculation_transcripts_annotation",
 6944                    "function_params": ["transcripts_json", None],
 6945                },
 6946                "transcripts_ann": {
 6947                    "type": "python",
 6948                    "name": "transcripts_ann",
 6949                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6950                    "available": True,
 6951                    "function_name": "calculation_transcripts_annotation",
 6952                    "function_params": [None, "transcripts_ann"],
 6953                },
 6954                "transcripts_annotations": {
 6955                    "type": "python",
 6956                    "name": "transcripts_annotations",
 6957                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6958                    "available": True,
 6959                    "function_name": "calculation_transcripts_annotation",
 6960                    "function_params": [None, None],
 6961                },
 6962                "transcripts_prioritization": {
 6963                    "type": "python",
 6964                    "name": "transcripts_prioritization",
 6965                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6966                    "available": True,
 6967                    "function_name": "calculation_transcripts_prioritization",
 6968                    "function_params": [],
 6969                },
 6970                "transcripts_export": {
 6971                    "type": "python",
 6972                    "name": "transcripts_export",
 6973                    "description": "Export transcripts table/view as a file (using param.json)",
 6974                    "available": True,
 6975                    "function_name": "calculation_transcripts_export",
 6976                    "function_params": [],
 6977                },
 6978            },
 6979            "prioritizations": {
 6980                "default": {
 6981                    "ANN2": [
 6982                        {
 6983                            "type": "contains",
 6984                            "value": "HIGH",
 6985                            "score": 5,
 6986                            "flag": "PASS",
 6987                            "comment": [
 6988                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6989                            ],
 6990                        },
 6991                        {
 6992                            "type": "contains",
 6993                            "value": "MODERATE",
 6994                            "score": 3,
 6995                            "flag": "PASS",
 6996                            "comment": [
 6997                                "A non-disruptive variant that might change protein effectiveness"
 6998                            ],
 6999                        },
 7000                        {
 7001                            "type": "contains",
 7002                            "value": "LOW",
 7003                            "score": 0,
 7004                            "flag": "FILTERED",
 7005                            "comment": [
 7006                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7007                            ],
 7008                        },
 7009                        {
 7010                            "type": "contains",
 7011                            "value": "MODIFIER",
 7012                            "score": 0,
 7013                            "flag": "FILTERED",
 7014                            "comment": [
 7015                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7016                            ],
 7017                        },
 7018                    ],
 7019                }
 7020            },
 7021        }
 7022
 7023        return config_default.get(name, None)
 7024
 7025    def get_config_json(
 7026        self, name: str, config_dict: dict = {}, config_file: str = None
 7027    ) -> dict:
 7028        """
 7029        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7030        default values, a dictionary, and a file.
 7031
 7032        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7033        the name of the configuration. It is used to identify and retrieve the configuration settings
 7034        for a specific component or module
 7035        :type name: str
 7036        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7037        dictionary that allows you to provide additional configuration settings or overrides. When you
 7038        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7039        the key is the configuration setting you want to override or
 7040        :type config_dict: dict
 7041        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7042        specify the path to a configuration file that contains additional settings. If provided, the
 7043        function will read the contents of this file and update the configuration dictionary with the
 7044        values found in the file, overriding any existing values with the
 7045        :type config_file: str
 7046        :return: The function `get_config_json` returns a dictionary containing the configuration
 7047        settings.
 7048        """
 7049
 7050        # Create with default prioritizations
 7051        config_default = self.get_config_default(name=name)
 7052        configuration = config_default
 7053        # log.debug(f"configuration={configuration}")
 7054
 7055        # Replace prioritizations from dict
 7056        for config in config_dict:
 7057            configuration[config] = config_dict[config]
 7058
 7059        # Replace prioritizations from file
 7060        config_file = full_path(config_file)
 7061        if config_file:
 7062            if os.path.exists(config_file):
 7063                with open(config_file) as config_file_content:
 7064                    config_file_dict = yaml.safe_load(config_file_content)
 7065                for config in config_file_dict:
 7066                    configuration[config] = config_file_dict[config]
 7067            else:
 7068                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7069                log.error(msg_error)
 7070                raise ValueError(msg_error)
 7071
 7072        return configuration
 7073
 7074    def prioritization(
 7075        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7076    ) -> bool:
 7077        """
 7078        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7079        prioritizes variants based on configured profiles and criteria.
 7080
 7081        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7082        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7083        a table name is provided, the method will prioritize the variants in that specific table
 7084        :type table: str
 7085        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7086        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7087        provided, the code will use a default prefix value of "PZ"
 7088        :type pz_prefix: str
 7089        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7090        additional parameters specific to the prioritization process. These parameters can include
 7091        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7092        configurations needed for the prioritization of variants in a V
 7093        :type pz_param: dict
 7094        :return: A boolean value (True) is being returned from the `prioritization` function.
 7095        """
 7096
 7097        # Config
 7098        config = self.get_config()
 7099
 7100        # Param
 7101        param = self.get_param()
 7102
 7103        # Prioritization param
 7104        if pz_param is not None:
 7105            prioritization_param = pz_param
 7106        else:
 7107            prioritization_param = param.get("prioritization", {})
 7108
 7109        # Configuration profiles
 7110        prioritization_config_file = prioritization_param.get(
 7111            "prioritization_config", None
 7112        )
 7113        prioritization_config_file = full_path(prioritization_config_file)
 7114        prioritizations_config = self.get_config_json(
 7115            name="prioritizations", config_file=prioritization_config_file
 7116        )
 7117
 7118        # Prioritization prefix
 7119        pz_prefix_default = "PZ"
 7120        if pz_prefix is None:
 7121            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7122
 7123        # Prioritization options
 7124        profiles = prioritization_param.get("profiles", [])
 7125        if isinstance(profiles, str):
 7126            profiles = profiles.split(",")
 7127        pzfields = prioritization_param.get(
 7128            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7129        )
 7130        if isinstance(pzfields, str):
 7131            pzfields = pzfields.split(",")
 7132        default_profile = prioritization_param.get("default_profile", None)
 7133        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7134        prioritization_score_mode = prioritization_param.get(
 7135            "prioritization_score_mode", "HOWARD"
 7136        )
 7137
 7138        # Quick Prioritizations
 7139        prioritizations = param.get("prioritizations", None)
 7140        if prioritizations:
 7141            log.info("Quick Prioritization:")
 7142            for profile in prioritizations.split(","):
 7143                if profile not in profiles:
 7144                    profiles.append(profile)
 7145                    log.info(f"   {profile}")
 7146
 7147        # If profile "ALL" provided, all profiles in the config profiles
 7148        if "ALL" in profiles:
 7149            profiles = list(prioritizations_config.keys())
 7150
 7151        for profile in profiles:
 7152            if prioritizations_config.get(profile, None):
 7153                log.debug(f"Profile '{profile}' configured")
 7154            else:
 7155                msg_error = f"Profile '{profile}' NOT configured"
 7156                log.error(msg_error)
 7157                raise ValueError(msg_error)
 7158
 7159        if profiles:
 7160            log.info(f"Prioritization... ")
 7161        else:
 7162            log.debug(f"No profile defined")
 7163            return False
 7164
 7165        if not default_profile and len(profiles):
 7166            default_profile = profiles[0]
 7167
 7168        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7169        log.debug("Profiles to check: " + str(list(profiles)))
 7170
 7171        # Variables
 7172        if table is not None:
 7173            table_variants = table
 7174        else:
 7175            table_variants = self.get_table_variants(clause="update")
 7176        log.debug(f"Table to prioritize: {table_variants}")
 7177
 7178        # Added columns
 7179        added_columns = []
 7180
 7181        # Create list of PZfields
 7182        # List of PZFields
 7183        list_of_pzfields_original = pzfields + [
 7184            pzfield + pzfields_sep + profile
 7185            for pzfield in pzfields
 7186            for profile in profiles
 7187        ]
 7188        list_of_pzfields = []
 7189        log.debug(f"{list_of_pzfields_original}")
 7190
 7191        # Remove existing PZfields to use if exists
 7192        for pzfield in list_of_pzfields_original:
 7193            if self.get_header().infos.get(pzfield, None) is None:
 7194                list_of_pzfields.append(pzfield)
 7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7196            else:
 7197                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7198
 7199        if list_of_pzfields:
 7200
 7201            # Explode Infos prefix
 7202            explode_infos_prefix = self.get_explode_infos_prefix()
 7203
 7204            # PZfields tags description
 7205            PZfields_INFOS = {
 7206                f"{pz_prefix}Tags": {
 7207                    "ID": f"{pz_prefix}Tags",
 7208                    "Number": ".",
 7209                    "Type": "String",
 7210                    "Description": "Variant tags based on annotation criteria",
 7211                },
 7212                f"{pz_prefix}Score": {
 7213                    "ID": f"{pz_prefix}Score",
 7214                    "Number": 1,
 7215                    "Type": "Integer",
 7216                    "Description": "Variant score based on annotation criteria",
 7217                },
 7218                f"{pz_prefix}Flag": {
 7219                    "ID": f"{pz_prefix}Flag",
 7220                    "Number": 1,
 7221                    "Type": "String",
 7222                    "Description": "Variant flag based on annotation criteria",
 7223                },
 7224                f"{pz_prefix}Comment": {
 7225                    "ID": f"{pz_prefix}Comment",
 7226                    "Number": ".",
 7227                    "Type": "String",
 7228                    "Description": "Variant comment based on annotation criteria",
 7229                },
 7230                f"{pz_prefix}Infos": {
 7231                    "ID": f"{pz_prefix}Infos",
 7232                    "Number": ".",
 7233                    "Type": "String",
 7234                    "Description": "Variant infos based on annotation criteria",
 7235                },
 7236                f"{pz_prefix}Class": {
 7237                    "ID": f"{pz_prefix}Class",
 7238                    "Number": ".",
 7239                    "Type": "String",
 7240                    "Description": "Variant class based on annotation criteria",
 7241                },
 7242            }
 7243
 7244            # Create INFO fields if not exist
 7245            for field in PZfields_INFOS:
 7246                field_ID = PZfields_INFOS[field]["ID"]
 7247                field_description = PZfields_INFOS[field]["Description"]
 7248                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7249                    field_description = (
 7250                        PZfields_INFOS[field]["Description"]
 7251                        + f", profile {default_profile}"
 7252                    )
 7253                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7254                        field_ID,
 7255                        PZfields_INFOS[field]["Number"],
 7256                        PZfields_INFOS[field]["Type"],
 7257                        field_description,
 7258                        "unknown",
 7259                        "unknown",
 7260                        code_type_map[PZfields_INFOS[field]["Type"]],
 7261                    )
 7262
 7263            # Create INFO fields if not exist for each profile
 7264            for profile in prioritizations_config:
 7265                if profile in profiles or profiles == []:
 7266                    for field in PZfields_INFOS:
 7267                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7268                        field_description = (
 7269                            PZfields_INFOS[field]["Description"]
 7270                            + f", profile {profile}"
 7271                        )
 7272                        if (
 7273                            field_ID not in self.get_header().infos
 7274                            and field in pzfields
 7275                        ):
 7276                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7277                                field_ID,
 7278                                PZfields_INFOS[field]["Number"],
 7279                                PZfields_INFOS[field]["Type"],
 7280                                field_description,
 7281                                "unknown",
 7282                                "unknown",
 7283                                code_type_map[PZfields_INFOS[field]["Type"]],
 7284                            )
 7285
 7286            # Header
 7287            for pzfield in list_of_pzfields:
 7288                if re.match(f"{pz_prefix}Score.*", pzfield):
 7289                    added_column = self.add_column(
 7290                        table_name=table_variants,
 7291                        column_name=pzfield,
 7292                        column_type="INTEGER",
 7293                        default_value="0",
 7294                    )
 7295                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7296                    added_column = self.add_column(
 7297                        table_name=table_variants,
 7298                        column_name=pzfield,
 7299                        column_type="BOOLEAN",
 7300                        default_value="1",
 7301                    )
 7302                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7303                    added_column = self.add_column(
 7304                        table_name=table_variants,
 7305                        column_name=pzfield,
 7306                        column_type="VARCHAR[]",
 7307                        default_value="null",
 7308                    )
 7309                else:
 7310                    added_column = self.add_column(
 7311                        table_name=table_variants,
 7312                        column_name=pzfield,
 7313                        column_type="STRING",
 7314                        default_value="''",
 7315                    )
 7316                added_columns.append(added_column)
 7317
 7318            # Profiles
 7319            if profiles:
 7320
 7321                # foreach profile in configuration file
 7322                for profile in prioritizations_config:
 7323
 7324                    # If profile is asked in param, or ALL are asked (empty profile [])
 7325                    if profile in profiles or profiles == []:
 7326                        log.info(f"Profile '{profile}'")
 7327
 7328                        sql_set_info_option = ""
 7329
 7330                        sql_set_info = []
 7331
 7332                        # PZ fields set
 7333
 7334                        # PZScore
 7335                        if (
 7336                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7337                            in list_of_pzfields
 7338                        ):
 7339                            sql_set_info.append(
 7340                                f"""
 7341                                    concat(
 7342                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7343                                        {pz_prefix}Score{pzfields_sep}{profile}
 7344                                    ) 
 7345                                """
 7346                            )
 7347                            if (
 7348                                profile == default_profile
 7349                                and f"{pz_prefix}Score" in list_of_pzfields
 7350                            ):
 7351                                sql_set_info.append(
 7352                                    f"""
 7353                                        concat(
 7354                                            '{pz_prefix}Score=',
 7355                                            {pz_prefix}Score{pzfields_sep}{profile}
 7356                                        )
 7357                                    """
 7358                                )
 7359
 7360                        # PZFlag
 7361                        if (
 7362                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7363                            in list_of_pzfields
 7364                        ):
 7365                            sql_set_info.append(
 7366                                f"""
 7367                                    concat(
 7368                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7369                                        CASE 
 7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7371                                            THEN 'PASS'
 7372                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7373                                            THEN 'FILTERED'
 7374                                        END
 7375                                    ) 
 7376                                """
 7377                            )
 7378                            if (
 7379                                profile == default_profile
 7380                                and f"{pz_prefix}Flag" in list_of_pzfields
 7381                            ):
 7382                                sql_set_info.append(
 7383                                    f"""
 7384                                        concat(
 7385                                            '{pz_prefix}Flag=',
 7386                                            CASE 
 7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7388                                                THEN 'PASS'
 7389                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7390                                                THEN 'FILTERED'
 7391                                            END
 7392                                        )
 7393                                    """
 7394                                )
 7395
 7396                        # PZClass
 7397                        if (
 7398                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7399                            in list_of_pzfields
 7400                        ):
 7401                            sql_set_info.append(
 7402                                f"""
 7403                                    concat(
 7404                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7405                                        CASE
 7406                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7407                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7408                                            ELSE '.'
 7409                                        END 
 7410                                    )
 7411                                    
 7412                                """
 7413                            )
 7414                            if (
 7415                                profile == default_profile
 7416                                and f"{pz_prefix}Class" in list_of_pzfields
 7417                            ):
 7418                                sql_set_info.append(
 7419                                    f"""
 7420                                        concat(
 7421                                            '{pz_prefix}Class=',
 7422                                            CASE
 7423                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7424                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7425                                                ELSE '.'
 7426                                            END 
 7427                                        )
 7428                                    """
 7429                                )
 7430
 7431                        # PZComment
 7432                        if (
 7433                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7434                            in list_of_pzfields
 7435                        ):
 7436                            sql_set_info.append(
 7437                                f"""
 7438                                    CASE
 7439                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7440                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7441                                        ELSE ''
 7442                                    END
 7443                                """
 7444                            )
 7445                            if (
 7446                                profile == default_profile
 7447                                and f"{pz_prefix}Comment" in list_of_pzfields
 7448                            ):
 7449                                sql_set_info.append(
 7450                                    f"""
 7451                                        CASE
 7452                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7453                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7454                                            ELSE ''
 7455                                        END
 7456                                    """
 7457                                )
 7458
 7459                        # PZInfos
 7460                        if (
 7461                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7462                            in list_of_pzfields
 7463                        ):
 7464                            sql_set_info.append(
 7465                                f"""
 7466                                    CASE
 7467                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7468                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7469                                        ELSE ''
 7470                                    END
 7471                                """
 7472                            )
 7473                            if (
 7474                                profile == default_profile
 7475                                and f"{pz_prefix}Infos" in list_of_pzfields
 7476                            ):
 7477                                sql_set_info.append(
 7478                                    f"""
 7479                                        CASE
 7480                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7481                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7482                                            ELSE ''
 7483                                        END
 7484                                    """
 7485                                )
 7486
 7487                        # Merge PZfields
 7488                        sql_set_info_option = ""
 7489                        sql_set_sep = ""
 7490                        for sql_set in sql_set_info:
 7491                            if sql_set_sep:
 7492                                sql_set_info_option += f"""
 7493                                    , concat('{sql_set_sep}', {sql_set})
 7494                                """
 7495                            else:
 7496                                sql_set_info_option += f"""
 7497                                    , {sql_set}
 7498                                """
 7499                            sql_set_sep = ";"
 7500
 7501                        sql_queries = []
 7502                        for annotation in prioritizations_config[profile]:
 7503
 7504                            # skip special sections
 7505                            if annotation.startswith("_"):
 7506                                continue
 7507
 7508                            # For each criterions
 7509                            for criterion in prioritizations_config[profile][
 7510                                annotation
 7511                            ]:
 7512
 7513                                # Criterion mode
 7514                                criterion_mode = None
 7515                                if np.any(
 7516                                    np.isin(list(criterion.keys()), ["type", "value"])
 7517                                ):
 7518                                    criterion_mode = "operation"
 7519                                elif np.any(
 7520                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7521                                ):
 7522                                    criterion_mode = "sql"
 7523                                log.debug(f"Criterion Mode: {criterion_mode}")
 7524
 7525                                # Criterion parameters
 7526                                criterion_type = criterion.get("type", None)
 7527                                criterion_value = criterion.get("value", None)
 7528                                criterion_sql = criterion.get("sql", None)
 7529                                criterion_fields = criterion.get("fields", None)
 7530                                criterion_score = criterion.get("score", 0)
 7531                                criterion_flag = criterion.get("flag", "PASS")
 7532                                criterion_class = criterion.get("class", None)
 7533                                criterion_flag_bool = criterion_flag == "PASS"
 7534                                criterion_comment = (
 7535                                    ", ".join(criterion.get("comment", []))
 7536                                    .replace("'", "''")
 7537                                    .replace(";", ",")
 7538                                    .replace("\t", " ")
 7539                                )
 7540                                criterion_infos = (
 7541                                    str(criterion)
 7542                                    .replace("'", "''")
 7543                                    .replace(";", ",")
 7544                                    .replace("\t", " ")
 7545                                )
 7546
 7547                                # SQL
 7548                                if criterion_sql is not None and isinstance(
 7549                                    criterion_sql, list
 7550                                ):
 7551                                    criterion_sql = " ".join(criterion_sql)
 7552
 7553                                # Fields and explode
 7554                                if criterion_fields is None:
 7555                                    criterion_fields = [annotation]
 7556                                if not isinstance(criterion_fields, list):
 7557                                    criterion_fields = str(criterion_fields).split(",")
 7558
 7559                                # Class
 7560                                if criterion_class is not None and not isinstance(
 7561                                    criterion_class, list
 7562                                ):
 7563                                    criterion_class = str(criterion_class).split(",")
 7564
 7565                                for annotation_field in criterion_fields:
 7566
 7567                                    # Explode specific annotation
 7568                                    log.debug(
 7569                                        f"Explode annotation '{annotation_field}'"
 7570                                    )
 7571                                    added_columns += self.explode_infos(
 7572                                        prefix=explode_infos_prefix,
 7573                                        fields=[annotation_field],
 7574                                        table=table_variants,
 7575                                    )
 7576                                    extra_infos = self.get_extra_infos(
 7577                                        table=table_variants
 7578                                    )
 7579
 7580                                    # Check if annotation field is present
 7581                                    if (
 7582                                        f"{explode_infos_prefix}{annotation_field}"
 7583                                        not in extra_infos
 7584                                    ):
 7585                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7586                                        log.error(msq_err)
 7587                                        raise ValueError(msq_err)
 7588                                    else:
 7589                                        log.debug(
 7590                                            f"Annotation '{annotation_field}' in data"
 7591                                        )
 7592
 7593                                sql_set = []
 7594                                sql_set_info = []
 7595
 7596                                # PZ fields set
 7597
 7598                                # PZScore
 7599                                if (
 7600                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7601                                    in list_of_pzfields
 7602                                ):
 7603                                    # VaRank prioritization score mode
 7604                                    if prioritization_score_mode.upper().strip() in [
 7605                                        "VARANK",
 7606                                        "MAX",
 7607                                        "MAXIMUM",
 7608                                        "TOP",
 7609                                    ]:
 7610                                        sql_set.append(
 7611                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7612                                        )
 7613                                    # default HOWARD prioritization score mode
 7614                                    else:
 7615                                        sql_set.append(
 7616                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7617                                        )
 7618
 7619                                # PZFlag
 7620                                if (
 7621                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7622                                    in list_of_pzfields
 7623                                ):
 7624                                    sql_set.append(
 7625                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7626                                    )
 7627
 7628                                # PZClass
 7629                                if (
 7630                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7631                                    in list_of_pzfields
 7632                                    and criterion_class is not None
 7633                                ):
 7634                                    sql_set.append(
 7635                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7636                                    )
 7637
 7638                                # PZComment
 7639                                if (
 7640                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7641                                    in list_of_pzfields
 7642                                ):
 7643                                    sql_set.append(
 7644                                        f"""
 7645                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7646                                                concat(
 7647                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7648                                                    CASE 
 7649                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7650                                                        THEN ', '
 7651                                                        ELSE ''
 7652                                                    END,
 7653                                                    '{criterion_comment}'
 7654                                                )
 7655                                        """
 7656                                    )
 7657
 7658                                # PZInfos
 7659                                if (
 7660                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7661                                    in list_of_pzfields
 7662                                ):
 7663                                    sql_set.append(
 7664                                        f"""
 7665                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7666                                                concat(
 7667                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7668                                                    '{criterion_infos}'
 7669                                                )
 7670                                        """
 7671                                    )
 7672                                sql_set_option = ",".join(sql_set)
 7673
 7674                                # Criterion and comparison
 7675                                if sql_set_option:
 7676
 7677                                    if criterion_mode in ["operation"]:
 7678
 7679                                        try:
 7680                                            float(criterion_value)
 7681                                            sql_update = f"""
 7682                                                UPDATE {table_variants}
 7683                                                SET {sql_set_option}
 7684                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7685                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7686                                            """
 7687                                        except:
 7688                                            contains_option = ""
 7689                                            if criterion_type == "contains":
 7690                                                contains_option = ".*"
 7691                                            sql_update = f"""
 7692                                                UPDATE {table_variants}
 7693                                                SET {sql_set_option}
 7694                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7695                                            """
 7696                                        sql_queries.append(sql_update)
 7697
 7698                                    elif criterion_mode in ["sql"]:
 7699
 7700                                        sql_update = f"""
 7701                                            UPDATE {table_variants}
 7702                                            SET {sql_set_option}
 7703                                            WHERE {criterion_sql}
 7704                                        """
 7705                                        sql_queries.append(sql_update)
 7706
 7707                                    else:
 7708                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7709                                        log.error(msg_err)
 7710                                        raise ValueError(msg_err)
 7711
 7712                                else:
 7713                                    log.warning(
 7714                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7715                                    )
 7716
 7717                        # PZTags
 7718                        if (
 7719                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7720                            in list_of_pzfields
 7721                        ):
 7722
 7723                            # Create PZFalgs value
 7724                            pztags_value = ""
 7725                            pztags_sep_default = ","
 7726                            pztags_sep = ""
 7727                            for pzfield in pzfields:
 7728                                if pzfield not in [f"{pz_prefix}Tags"]:
 7729                                    if (
 7730                                        f"{pzfield}{pzfields_sep}{profile}"
 7731                                        in list_of_pzfields
 7732                                    ):
 7733                                        if pzfield in [f"{pz_prefix}Flag"]:
 7734                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7735                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7736                                                    THEN 'PASS'
 7737                                                    ELSE 'FILTERED'
 7738                                                END, '"""
 7739                                        elif pzfield in [f"{pz_prefix}Class"]:
 7740                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7741                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7742                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7743                                                    ELSE '.'
 7744                                                END, '"""
 7745                                        else:
 7746                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7747                                        pztags_sep = pztags_sep_default
 7748
 7749                            # Add Query update for PZFlags
 7750                            sql_update_pztags = f"""
 7751                                UPDATE {table_variants}
 7752                                SET INFO = concat(
 7753                                        INFO,
 7754                                        CASE WHEN INFO NOT in ('','.')
 7755                                                THEN ';'
 7756                                                ELSE ''
 7757                                        END,
 7758                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7759                                    )
 7760                                """
 7761                            sql_queries.append(sql_update_pztags)
 7762
 7763                            # Add Query update for PZFlags for default
 7764                            if profile == default_profile:
 7765                                sql_update_pztags_default = f"""
 7766                                UPDATE {table_variants}
 7767                                SET INFO = concat(
 7768                                        INFO,
 7769                                        ';',
 7770                                        '{pz_prefix}Tags={pztags_value}'
 7771                                    )
 7772                                """
 7773                                sql_queries.append(sql_update_pztags_default)
 7774
 7775                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7776
 7777                        if sql_queries:
 7778
 7779                            for sql_query in sql_queries:
 7780                                log.debug(
 7781                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7782                                )
 7783                                self.conn.execute(sql_query)
 7784
 7785                        log.info(f"""Profile '{profile}' - Update... """)
 7786                        sql_query_update = f"""
 7787                            UPDATE {table_variants}
 7788                            SET INFO =  
 7789                                concat(
 7790                                    CASE
 7791                                        WHEN INFO NOT IN ('','.')
 7792                                        THEN concat(INFO, ';')
 7793                                        ELSE ''
 7794                                    END
 7795                                    {sql_set_info_option}
 7796                                )
 7797                        """
 7798                        self.conn.execute(sql_query_update)
 7799
 7800        else:
 7801
 7802            log.warning(f"No profiles in parameters")
 7803
 7804        # Remove added columns
 7805        for added_column in added_columns:
 7806            self.drop_column(column=added_column)
 7807
 7808        # Explode INFOS fields into table fields
 7809        if self.get_explode_infos():
 7810            self.explode_infos(
 7811                prefix=self.get_explode_infos_prefix(),
 7812                fields=self.get_explode_infos_fields(),
 7813                force=True,
 7814            )
 7815
 7816        return True
 7817
 7818    ###
 7819    # HGVS
 7820    ###
 7821
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Overall flow:
        1. Merge quick 'hgvs_options' (comma-separated "key=value" pairs) into param["hgvs"];
           return immediately if no 'hgvs' section is present.
        2. Locate the genome FASTA, refSeq and refSeqLink database files.
        3. Load refSeq/refSeqLink joins (restricted to the variant positions) into Polars
           dataframes and a transcripts model (via a temporary TSV export).
        4. Apply HGVS formatting per variant in parallel using Dask partitions.
        5. Write results back through a temporary Parquet file, append them to the INFO
           column, and register the 'hgvs' INFO field in the VCF header.

        NOTE(review): `pl` (polars) and `dd` (dask.dataframe) are not imported in the visible
        header; presumably they come from the wildcard imports of howard.functions.* — confirm.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Relies on enclosing-scope (closure) variables: `polars_conn`, `refseq_df`,
            `refseqlink_df`, `transcripts`, `genome` and the HGVS option flags
            (`use_exon`, `use_gene`, `use_protein`, `add_protein`, `full_format`,
            `use_version`, `codon_type`).

            :param row: A dictionary-like object that contains the values for the following keys:
            CHROM, POS, REF, ALT
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): if refseqlink_file was not found but one of these flags
                # is set, `refseqlink_df` is undefined and this query will raise —
                # confirm upstream configuration guarantees the file's presence.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # add_protein: additionally emit a protein-level name when the main
                # name is not already protein-level or full-format
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse comma-separated "key" or "key=value" options into param["hgvs"];
        # a bare key defaults to True, "true"/"false" strings become booleans
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            # No 'hgvs' section: nothing to do
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink
        # param-level refseq/refseqlink override the config-level values
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        # Prefer an explicitly configured genome; otherwise search by assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        # (REF/ALT must be pure letters — excludes symbolic/structural alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # (random suffix to avoid clashing with an existing column)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (join restricts transcripts to the positions actually present in df_variants)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to a temporary TSV because read_transcripts consumes a file handle.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): re-creates the Polars SQL context already built above —
        # appears redundant; confirm before removing.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column
        # Append 'hgvs=<value>' to INFO, prefixing ';' only when INFO is non-empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): "annotatation" typo in the user-visible description —
        # left as-is here (runtime string); fix separately if output may change.
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8204
 8205    ###
 8206    # Calculation
 8207    ###
 8208
 8209    def get_operations_help(
 8210        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8211    ) -> list:
 8212
 8213        # Init
 8214        operations_help = []
 8215
 8216        # operations
 8217        operations = self.get_config_json(
 8218            name="calculations",
 8219            config_dict=operations_config_dict,
 8220            config_file=operations_config_file,
 8221        )
 8222        for op in operations:
 8223            op_name = operations[op].get("name", op).upper()
 8224            op_description = operations[op].get("description", op_name)
 8225            op_available = operations[op].get("available", False)
 8226            if op_available:
 8227                operations_help.append(f"   {op_name}: {op_description}")
 8228
 8229        # Sort operations
 8230        operations_help.sort()
 8231
 8232        # insert header
 8233        operations_help.insert(0, "Available calculation operations:")
 8234
 8235        # Return
 8236        return operations_help
 8237
 8238    def calculation(
 8239        self,
 8240        operations: dict = {},
 8241        operations_config_dict: dict = {},
 8242        operations_config_file: str = None,
 8243    ) -> None:
 8244        """
 8245        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8246        operation, and then calls the appropriate function
 8247
 8248        param json example:
 8249            "calculation": {
 8250                "NOMEN": {
 8251                    "options": {
 8252                        "hgvs_field": "hgvs"
 8253                    },
 8254                "middle" : null
 8255            }
 8256        """
 8257
 8258        # Param
 8259        param = self.get_param()
 8260
 8261        # CHeck operations config file
 8262        if operations_config_file is None:
 8263            operations_config_file = param.get("calculation", {}).get(
 8264                "calculation_config", None
 8265            )
 8266
 8267        # operations config
 8268        operations_config = self.get_config_json(
 8269            name="calculations",
 8270            config_dict=operations_config_dict,
 8271            config_file=operations_config_file,
 8272        )
 8273
 8274        # Upper keys
 8275        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8276
 8277        # Calculations
 8278
 8279        # Operations from param
 8280        operations = param.get("calculation", {}).get("calculations", operations)
 8281
 8282        # Quick calculation - add
 8283        if param.get("calculations", None):
 8284
 8285            # List of operations
 8286            calculations_list = [
 8287                value.strip() for value in param.get("calculations", "").split(",")
 8288            ]
 8289
 8290            # Log
 8291            log.info(f"Quick Calculations:")
 8292            for calculation_key in calculations_list:
 8293                log.info(f"   {calculation_key}")
 8294
 8295            # Create tmp operations (to keep operation order)
 8296            operations_tmp = {}
 8297            for calculation_operation in calculations_list:
 8298                if calculation_operation.upper() not in operations_tmp:
 8299                    log.debug(
 8300                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8301                    )
 8302                    operations_tmp[calculation_operation.upper()] = {}
 8303                    add_value_into_dict(
 8304                        dict_tree=operations_tmp,
 8305                        sections=[
 8306                            calculation_operation.upper(),
 8307                        ],
 8308                        value=operations.get(calculation_operation.upper(), {}),
 8309                    )
 8310            # Add operations already in param
 8311            for calculation_operation in operations:
 8312                if calculation_operation not in operations_tmp:
 8313                    operations_tmp[calculation_operation] = operations.get(
 8314                        calculation_operation, {}
 8315                    )
 8316
 8317            # Update operations in param
 8318            operations = operations_tmp
 8319
 8320        # Operations for calculation
 8321        if not operations:
 8322            operations = param.get("calculation", {}).get("calculations", {})
 8323
 8324        if operations:
 8325            log.info(f"Calculations...")
 8326
 8327        # For each operations
 8328        for operation_name in operations:
 8329            operation_name = operation_name.upper()
 8330            if operation_name not in [""]:
 8331                if operation_name in operations_config:
 8332                    log.info(f"Calculation '{operation_name}'")
 8333                    operation = operations_config[operation_name]
 8334                    operation_type = operation.get("type", "sql")
 8335                    if operation_type == "python":
 8336                        self.calculation_process_function(
 8337                            operation=operation, operation_name=operation_name
 8338                        )
 8339                    elif operation_type == "sql":
 8340                        self.calculation_process_sql(
 8341                            operation=operation, operation_name=operation_name
 8342                        )
 8343                    else:
 8344                        log.error(
 8345                            f"Operations config: Type '{operation_type}' NOT available"
 8346                        )
 8347                        raise ValueError(
 8348                            f"Operations config: Type '{operation_type}' NOT available"
 8349                        )
 8350                else:
 8351                    log.error(
 8352                        f"Operations config: Calculation '{operation_name}' NOT available"
 8353                    )
 8354                    raise ValueError(
 8355                        f"Operations config: Calculation '{operation_name}' NOT available"
 8356                    )
 8357
 8358        # Explode INFOS fields into table fields
 8359        if self.get_explode_infos():
 8360            self.explode_infos(
 8361                prefix=self.get_explode_infos_prefix(),
 8362                fields=self.get_explode_infos_fields(),
 8363                force=True,
 8364            )
 8365
 8366    def calculation_process_sql(
 8367        self, operation: dict, operation_name: str = "unknown"
 8368    ) -> None:
 8369        """
 8370        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8371        performs the operation, updating the specified table with the result.
 8372
 8373        :param operation: The `operation` parameter is a dictionary that contains information about the
 8374        mathematical operation to be performed. It includes the following keys:
 8375        :type operation: dict
 8376        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8377        the mathematical operation being performed. It is used for logging and error handling purposes,
 8378        defaults to unknown
 8379        :type operation_name: str (optional)
 8380        """
 8381
 8382        # Operation infos
 8383        operation_name = operation.get("name", "unknown")
 8384        log.debug(f"process SQL {operation_name}")
 8385        output_column_name = operation.get("output_column_name", operation_name)
 8386        output_column_type = operation.get("output_column_type", "String")
 8387        prefix = operation.get("explode_infos_prefix", "")
 8388        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8389        output_column_description = operation.get(
 8390            "output_column_description", f"{operation_name} operation"
 8391        )
 8392        operation_query = operation.get("operation_query", None)
 8393        if isinstance(operation_query, list):
 8394            operation_query = " ".join(operation_query)
 8395        operation_info_fields = operation.get("info_fields", [])
 8396        operation_info_fields_check = operation.get("info_fields_check", False)
 8397        operation_info = operation.get("operation_info", True)
 8398        operation_table = operation.get(
 8399            "table", self.get_table_variants(clause="alter")
 8400        )
 8401
 8402        # table variants
 8403        if operation_table:
 8404            table_variants = operation_table
 8405        else:
 8406            table_variants = self.get_table_variants(clause="alter")
 8407
 8408        if operation_query:
 8409
 8410            # Info fields check
 8411            operation_info_fields_check_result = True
 8412            if operation_info_fields_check:
 8413                header_infos = self.get_header().infos
 8414                for info_field in operation_info_fields:
 8415                    operation_info_fields_check_result = (
 8416                        operation_info_fields_check_result
 8417                        and info_field in header_infos
 8418                    )
 8419
 8420            # If info fields available
 8421            if operation_info_fields_check_result:
 8422
 8423                # Added_columns
 8424                added_columns = []
 8425
 8426                # Create VCF header field
 8427                vcf_reader = self.get_header()
 8428                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8429                    output_column_name,
 8430                    ".",
 8431                    output_column_type,
 8432                    output_column_description,
 8433                    "howard calculation",
 8434                    "0",
 8435                    self.code_type_map.get(output_column_type),
 8436                )
 8437
 8438                # Explode infos if needed
 8439                log.debug(f"calculation_process_sql prefix {prefix}")
 8440                added_columns += self.explode_infos(
 8441                    prefix=prefix,
 8442                    fields=[output_column_name] + operation_info_fields,
 8443                    force=False,
 8444                    table=table_variants,
 8445                )
 8446
 8447                # Create column
 8448                added_column = self.add_column(
 8449                    table_name=table_variants,
 8450                    column_name=prefix + output_column_name,
 8451                    column_type=output_column_type_sql,
 8452                    default_value="null",
 8453                )
 8454                added_columns.append(added_column)
 8455
 8456                # Operation calculation
 8457                try:
 8458
 8459                    # Query to update calculation column
 8460                    sql_update = f"""
 8461                        UPDATE {table_variants}
 8462                        SET "{prefix}{output_column_name}" = ({operation_query})
 8463                    """
 8464                    self.conn.execute(sql_update)
 8465
 8466                    # Add to INFO
 8467                    if operation_info:
 8468                        sql_update_info = f"""
 8469                            UPDATE {table_variants}
 8470                            SET "INFO" =
 8471                                concat(
 8472                                    CASE
 8473                                        WHEN "INFO" IS NOT NULL
 8474                                        THEN concat("INFO", ';')
 8475                                        ELSE ''
 8476                                    END,
 8477                                    '{output_column_name}=',
 8478                                    "{prefix}{output_column_name}"
 8479                                )
 8480                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8481                        """
 8482                        self.conn.execute(sql_update_info)
 8483
 8484                except:
 8485                    log.error(
 8486                        f"Operations config: Calculation '{operation_name}' query failed"
 8487                    )
 8488                    raise ValueError(
 8489                        f"Operations config: Calculation '{operation_name}' query failed"
 8490                    )
 8491
 8492                # Remove added columns
 8493                for added_column in added_columns:
 8494                    log.debug(f"added_column: {added_column}")
 8495                    self.drop_column(column=added_column)
 8496
 8497            else:
 8498                log.error(
 8499                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8500                )
 8501                raise ValueError(
 8502                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8503                )
 8504
 8505        else:
 8506            log.error(
 8507                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8508            )
 8509            raise ValueError(
 8510                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8511            )
 8512
 8513    def calculation_process_function(
 8514        self, operation: dict, operation_name: str = "unknown"
 8515    ) -> None:
 8516        """
 8517        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8518        function with the given parameters.
 8519
 8520        :param operation: The `operation` parameter is a dictionary that contains information about the
 8521        operation to be performed. It has the following keys:
 8522        :type operation: dict
 8523        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8524        the operation being performed. It is used for logging purposes, defaults to unknown
 8525        :type operation_name: str (optional)
 8526        """
 8527
 8528        operation_name = operation["name"]
 8529        log.debug(f"process Python {operation_name}")
 8530        function_name = operation["function_name"]
 8531        function_params = operation["function_params"]
 8532        getattr(self, function_name)(*function_params)
 8533
 8534    def calculation_variant_id(self) -> None:
 8535        """
 8536        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8537        updates the INFO field of a variants table with the variant ID.
 8538        """
 8539
 8540        # variant_id annotation field
 8541        variant_id_tag = self.get_variant_id_column()
 8542        added_columns = [variant_id_tag]
 8543
 8544        # variant_id hgvs tags"
 8545        vcf_infos_tags = {
 8546            variant_id_tag: "howard variant ID annotation",
 8547        }
 8548
 8549        # Variants table
 8550        table_variants = self.get_table_variants()
 8551
 8552        # Header
 8553        vcf_reader = self.get_header()
 8554
 8555        # Add variant_id to header
 8556        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8557            variant_id_tag,
 8558            ".",
 8559            "String",
 8560            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8561            "howard calculation",
 8562            "0",
 8563            self.code_type_map.get("String"),
 8564        )
 8565
 8566        # Update
 8567        sql_update = f"""
 8568            UPDATE {table_variants}
 8569            SET "INFO" = 
 8570                concat(
 8571                    CASE
 8572                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8573                        THEN ''
 8574                        ELSE concat("INFO", ';')
 8575                    END,
 8576                    '{variant_id_tag}=',
 8577                    "{variant_id_tag}"
 8578                )
 8579        """
 8580        self.conn.execute(sql_update)
 8581
 8582        # Remove added columns
 8583        for added_column in added_columns:
 8584            self.drop_column(column=added_column)
 8585
 8586    def calculation_extract_snpeff_hgvs(
 8587        self,
 8588        snpeff_hgvs: str = "snpeff_hgvs",
 8589        snpeff_field: str = "ANN",
 8590    ) -> None:
 8591        """
 8592        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8593        annotation field in a VCF file and adds them as a new column in the variants table.
 8594
 8595        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8596        function is used to specify the name of the column that will store the HGVS nomenclatures
 8597        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8598        snpeff_hgvs
 8599        :type snpeff_hgvs: str (optional)
 8600        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8601        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8602        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8603        to ANN
 8604        :type snpeff_field: str (optional)
 8605        """
 8606
 8607        # Snpeff hgvs tags
 8608        vcf_infos_tags = {
 8609            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8610        }
 8611
 8612        # Prefix
 8613        prefix = self.get_explode_infos_prefix()
 8614        if prefix:
 8615            prefix = "INFO/"
 8616
 8617        # snpEff fields
 8618        speff_ann_infos = prefix + snpeff_field
 8619        speff_hgvs_infos = prefix + snpeff_hgvs
 8620
 8621        # Variants table
 8622        table_variants = self.get_table_variants()
 8623
 8624        # Header
 8625        vcf_reader = self.get_header()
 8626
 8627        # Add columns
 8628        added_columns = []
 8629
 8630        # Explode HGVS field in column
 8631        added_columns += self.explode_infos(fields=[snpeff_field])
 8632
 8633        if snpeff_field in vcf_reader.infos:
 8634
 8635            log.debug(vcf_reader.infos[snpeff_field])
 8636
 8637            # Extract ANN header
 8638            ann_description = vcf_reader.infos[snpeff_field].desc
 8639            pattern = r"'(.+?)'"
 8640            match = re.search(pattern, ann_description)
 8641            if match:
 8642                ann_header_match = match.group(1).split(" | ")
 8643                ann_header_desc = {}
 8644                for i in range(len(ann_header_match)):
 8645                    ann_header_info = "".join(
 8646                        char for char in ann_header_match[i] if char.isalnum()
 8647                    )
 8648                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8649                if not ann_header_desc:
 8650                    raise ValueError("Invalid header description format")
 8651            else:
 8652                raise ValueError("Invalid header description format")
 8653
 8654            # Create variant id
 8655            variant_id_column = self.get_variant_id_column()
 8656            added_columns += [variant_id_column]
 8657
 8658            # Create dataframe
 8659            dataframe_snpeff_hgvs = self.get_query_to_df(
 8660                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8661            )
 8662
 8663            # Create main NOMEN column
 8664            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8665                speff_ann_infos
 8666            ].apply(
 8667                lambda x: extract_snpeff_hgvs(
 8668                    str(x), header=list(ann_header_desc.values())
 8669                )
 8670            )
 8671
 8672            # Add snpeff_hgvs to header
 8673            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8674                snpeff_hgvs,
 8675                ".",
 8676                "String",
 8677                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8678                "howard calculation",
 8679                "0",
 8680                self.code_type_map.get("String"),
 8681            )
 8682
 8683            # Update
 8684            sql_update = f"""
 8685                UPDATE variants
 8686                SET "INFO" = 
 8687                    concat(
 8688                        CASE
 8689                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8690                            THEN ''
 8691                            ELSE concat("INFO", ';')
 8692                        END,
 8693                        CASE 
 8694                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8695                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8696                            THEN concat(
 8697                                    '{snpeff_hgvs}=',
 8698                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8699                                )
 8700                            ELSE ''
 8701                        END
 8702                    )
 8703                FROM dataframe_snpeff_hgvs
 8704                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8705
 8706            """
 8707            self.conn.execute(sql_update)
 8708
 8709            # Delete dataframe
 8710            del dataframe_snpeff_hgvs
 8711            gc.collect()
 8712
 8713        else:
 8714
 8715            log.warning(
 8716                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8717            )
 8718
 8719        # Remove added columns
 8720        for added_column in added_columns:
 8721            self.drop_column(column=added_column)
 8722
 8723    def calculation_snpeff_ann_explode(
 8724        self,
 8725        uniquify: bool = True,
 8726        output_format: str = "fields",
 8727        output_prefix: str = "snpeff_",
 8728        snpeff_field: str = "ANN",
 8729    ) -> None:
 8730        """
 8731        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8732        exploding the HGVS field and updating variant information accordingly.
 8733
 8734        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8735        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8736        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8737        defaults to True
 8738        :type uniquify: bool (optional)
 8739        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8740        function specifies the format in which the output annotations will be generated. It has a
 8741        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8742        format, defaults to fields
 8743        :type output_format: str (optional)
 8744        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8745        method is used to specify the prefix that will be added to the output annotations generated
 8746        during the calculation process. This prefix helps to differentiate the newly added annotations
 8747        from existing ones in the output data. By default, the, defaults to ANN_
 8748        :type output_prefix: str (optional)
 8749        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8750        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8751        field will be processed to explode the HGVS annotations and update the variant information
 8752        accordingly, defaults to ANN
 8753        :type snpeff_field: str (optional)
 8754        """
 8755
 8756        # SnpEff annotation field
 8757        snpeff_hgvs = "snpeff_ann_explode"
 8758
 8759        # Snpeff hgvs tags
 8760        vcf_infos_tags = {
 8761            snpeff_hgvs: "Explode snpEff annotations",
 8762        }
 8763
 8764        # Prefix
 8765        prefix = self.get_explode_infos_prefix()
 8766        if prefix:
 8767            prefix = "INFO/"
 8768
 8769        # snpEff fields
 8770        speff_ann_infos = prefix + snpeff_field
 8771        speff_hgvs_infos = prefix + snpeff_hgvs
 8772
 8773        # Variants table
 8774        table_variants = self.get_table_variants()
 8775
 8776        # Header
 8777        vcf_reader = self.get_header()
 8778
 8779        # Add columns
 8780        added_columns = []
 8781
 8782        # Explode HGVS field in column
 8783        added_columns += self.explode_infos(fields=[snpeff_field])
 8784        log.debug(f"snpeff_field={snpeff_field}")
 8785        log.debug(f"added_columns={added_columns}")
 8786
 8787        if snpeff_field in vcf_reader.infos:
 8788
 8789            # Extract ANN header
 8790            ann_description = vcf_reader.infos[snpeff_field].desc
 8791            pattern = r"'(.+?)'"
 8792            match = re.search(pattern, ann_description)
 8793            if match:
 8794                ann_header_match = match.group(1).split(" | ")
 8795                ann_header = []
 8796                ann_header_desc = {}
 8797                for i in range(len(ann_header_match)):
 8798                    ann_header_info = "".join(
 8799                        char for char in ann_header_match[i] if char.isalnum()
 8800                    )
 8801                    ann_header.append(ann_header_info)
 8802                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8803                if not ann_header_desc:
 8804                    raise ValueError("Invalid header description format")
 8805            else:
 8806                raise ValueError("Invalid header description format")
 8807
 8808            # Create variant id
 8809            variant_id_column = self.get_variant_id_column()
 8810            added_columns += [variant_id_column]
 8811
 8812            # Create dataframe
 8813            dataframe_snpeff_hgvs = self.get_query_to_df(
 8814                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8815            )
 8816
 8817            # Create snpEff columns
 8818            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8819                speff_ann_infos
 8820            ].apply(
 8821                lambda x: explode_snpeff_ann(
 8822                    str(x),
 8823                    uniquify=uniquify,
 8824                    output_format=output_format,
 8825                    prefix=output_prefix,
 8826                    header=list(ann_header_desc.values()),
 8827                )
 8828            )
 8829
 8830            # Header
 8831            ann_annotations_prefix = ""
 8832            if output_format.upper() in ["JSON"]:
 8833                ann_annotations_prefix = f"{output_prefix}="
 8834                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8835                    output_prefix,
 8836                    ".",
 8837                    "String",
 8838                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8839                    + " - JSON format",
 8840                    "howard calculation",
 8841                    "0",
 8842                    self.code_type_map.get("String"),
 8843                )
 8844            else:
 8845                for ann_annotation in ann_header:
 8846                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8847                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8848                        ann_annotation_id,
 8849                        ".",
 8850                        "String",
 8851                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8852                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8853                        "howard calculation",
 8854                        "0",
 8855                        self.code_type_map.get("String"),
 8856                    )
 8857
 8858            # Update
 8859            sql_update = f"""
 8860                UPDATE variants
 8861                SET "INFO" = 
 8862                    concat(
 8863                        CASE
 8864                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8865                            THEN ''
 8866                            ELSE concat("INFO", ';')
 8867                        END,
 8868                        CASE 
 8869                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8870                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8871                            THEN concat(
 8872                                '{ann_annotations_prefix}',
 8873                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8874                                )
 8875                            ELSE ''
 8876                        END
 8877                    )
 8878                FROM dataframe_snpeff_hgvs
 8879                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8880
 8881            """
 8882            self.conn.execute(sql_update)
 8883
 8884            # Delete dataframe
 8885            del dataframe_snpeff_hgvs
 8886            gc.collect()
 8887
 8888        else:
 8889
 8890            log.warning(
 8891                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8892            )
 8893
 8894        # Remove added columns
 8895        for added_column in added_columns:
 8896            self.drop_column(column=added_column)
 8897
 8898    def calculation_extract_nomen(self) -> None:
 8899        """
 8900        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8901        """
 8902
 8903        # NOMEN field
 8904        field_nomen_dict = "NOMEN_DICT"
 8905
 8906        # NOMEN structure
 8907        nomen_dict = {
 8908            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8909            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8910            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8911            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8912            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8913            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8914            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8915            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8916            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8917            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8918        }
 8919
 8920        # Param
 8921        param = self.get_param()
 8922
 8923        # Threads
 8924        threads = self.get_threads()
 8925
 8926        # Prefix
 8927        prefix = self.get_explode_infos_prefix()
 8928
 8929        # Header
 8930        vcf_reader = self.get_header()
 8931
 8932        # Added columns
 8933        added_columns = []
 8934
 8935        # Get HGVS field
 8936        hgvs_field = (
 8937            param.get("calculation", {})
 8938            .get("calculations", {})
 8939            .get("NOMEN", {})
 8940            .get("options", {})
 8941            .get("hgvs_field", "hgvs")
 8942        )
 8943
 8944        # Get NOMEN pattern
 8945        nomen_pattern = (
 8946            param.get("calculation", {})
 8947            .get("calculations", {})
 8948            .get("NOMEN", {})
 8949            .get("options", {})
 8950            .get("pattern", None)
 8951        )
 8952
 8953        # transcripts list of preference sources
 8954        transcripts_sources = {}
 8955
 8956        # Get transcripts
 8957        transcripts_file = (
 8958            param.get("calculation", {})
 8959            .get("calculations", {})
 8960            .get("NOMEN", {})
 8961            .get("options", {})
 8962            .get("transcripts", None)
 8963        )
 8964        transcripts_file = full_path(transcripts_file)
 8965        if transcripts_file:
 8966            if os.path.exists(transcripts_file):
 8967                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8968                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 8969                transcripts_sources["file"] = transcripts_from_file
 8970            else:
 8971                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 8972                log.error(msg_err)
 8973                raise ValueError(msg_err)
 8974
 8975        # Get transcripts table
 8976        transcripts_table = (
 8977            param.get("calculation", {})
 8978            .get("calculations", {})
 8979            .get("NOMEN", {})
 8980            .get("options", {})
 8981            .get("transcripts_table", self.get_table_variants())
 8982        )
 8983        # Get transcripts column
 8984        transcripts_column = (
 8985            param.get("calculation", {})
 8986            .get("calculations", {})
 8987            .get("NOMEN", {})
 8988            .get("options", {})
 8989            .get("transcripts_column", None)
 8990        )
 8991
 8992        if transcripts_table and transcripts_column:
 8993            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 8994            # Explode if not exists
 8995            added_columns += self.explode_infos(
 8996                fields=[transcripts_column], table=transcripts_table
 8997            )
 8998        else:
 8999            extra_field_transcript = f"NULL"
 9000
 9001        # Transcripts of preference source order
 9002        transcripts_order = (
 9003            param.get("calculation", {})
 9004            .get("calculations", {})
 9005            .get("NOMEN", {})
 9006            .get("options", {})
 9007            .get("transcripts_order", ["column", "file"])
 9008        )
 9009
 9010        # Transcripts from file
 9011        transcripts = transcripts_sources.get("file", [])
 9012
 9013        # Explode HGVS field in column
 9014        added_columns += self.explode_infos(fields=[hgvs_field])
 9015
 9016        # extra infos
 9017        extra_infos = self.get_extra_infos()
 9018        extra_field = prefix + hgvs_field
 9019
 9020        if extra_field in extra_infos:
 9021
 9022            # Create dataframe
 9023            dataframe_hgvs = self.get_query_to_df(
 9024                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 9025            )
 9026
 9027            # Transcripts rank
 9028            transcripts_rank = {
 9029                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
 9030            }
 9031            transcripts_len = len(transcripts_rank)
 9032
 9033            # Create main NOMEN column
 9034            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 9035                lambda x: find_nomen(
 9036                    hgvs=x.hgvs,
 9037                    transcript=x.transcript,
 9038                    transcripts=transcripts_rank,
 9039                    pattern=nomen_pattern,
 9040                    transcripts_source_order=transcripts_order,
 9041                    transcripts_len=transcripts_len,
 9042                ),
 9043                axis=1,
 9044            )
 9045
 9046            # Explode NOMEN Structure and create SQL set for update
 9047            sql_nomen_fields = []
 9048            for nomen_field in nomen_dict:
 9049
 9050                # Create VCF header field
 9051                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 9052                    nomen_field,
 9053                    ".",
 9054                    "String",
 9055                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 9056                    "howard calculation",
 9057                    "0",
 9058                    self.code_type_map.get("String"),
 9059                )
 9060
 9061                # Add field to SQL query update
 9062                sql_nomen_fields.append(
 9063                    f"""
 9064                        CASE 
 9065                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
 9066                            THEN concat(
 9067                                    ';{nomen_field}=',
 9068                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
 9069                                )
 9070                            ELSE ''
 9071                        END
 9072                    """
 9073                )
 9074
 9075            # SQL set for update
 9076            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 9077
 9078            # Update
 9079            sql_update = f"""
 9080                UPDATE variants
 9081                SET "INFO" = 
 9082                    concat(
 9083                        CASE
 9084                            WHEN "INFO" IS NULL
 9085                            THEN ''
 9086                            ELSE "INFO"
 9087                        END,
 9088                        {sql_nomen_fields_set}
 9089                    )
 9090                FROM dataframe_hgvs
 9091                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 9092                    AND variants."POS" = dataframe_hgvs."POS" 
 9093                    AND variants."REF" = dataframe_hgvs."REF"
 9094                    AND variants."ALT" = dataframe_hgvs."ALT"
 9095            """
 9096            self.conn.execute(sql_update)
 9097
 9098            # Delete dataframe
 9099            del dataframe_hgvs
 9100            gc.collect()
 9101
 9102        # Remove added columns
 9103        for added_column in added_columns:
 9104            self.drop_column(column=added_column)
 9105
 9106    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9107        """
 9108        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9109        pipeline/sample for a variant and updates the variant information in a VCF file.
 9110
 9111        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9112        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9113        VCF header and to update the corresponding field in the variants table, defaults to
 9114        findbypipeline
 9115        :type tag: str (optional)
 9116        """
 9117
 9118        # if FORMAT and samples
 9119        if (
 9120            "FORMAT" in self.get_header_columns_as_list()
 9121            and self.get_header_sample_list()
 9122        ):
 9123
 9124            # findbypipeline annotation field
 9125            findbypipeline_tag = tag
 9126
 9127            # VCF infos tags
 9128            vcf_infos_tags = {
 9129                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9130            }
 9131
 9132            # Prefix
 9133            prefix = self.get_explode_infos_prefix()
 9134
 9135            # Field
 9136            findbypipeline_infos = prefix + findbypipeline_tag
 9137
 9138            # Variants table
 9139            table_variants = self.get_table_variants()
 9140
 9141            # Header
 9142            vcf_reader = self.get_header()
 9143
 9144            # Create variant id
 9145            variant_id_column = self.get_variant_id_column()
 9146            added_columns = [variant_id_column]
 9147
 9148            # variant_id, FORMAT and samples
 9149            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9150                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9151            )
 9152
 9153            # Create dataframe
 9154            dataframe_findbypipeline = self.get_query_to_df(
 9155                f""" SELECT {samples_fields} FROM {table_variants} """
 9156            )
 9157
 9158            # Create findbypipeline column
 9159            dataframe_findbypipeline[findbypipeline_infos] = (
 9160                dataframe_findbypipeline.apply(
 9161                    lambda row: findbypipeline(
 9162                        row, samples=self.get_header_sample_list()
 9163                    ),
 9164                    axis=1,
 9165                )
 9166            )
 9167
 9168            # Add snpeff_hgvs to header
 9169            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9170                findbypipeline_tag,
 9171                ".",
 9172                "String",
 9173                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9174                "howard calculation",
 9175                "0",
 9176                self.code_type_map.get("String"),
 9177            )
 9178
 9179            # Update
 9180            sql_update = f"""
 9181                UPDATE variants
 9182                SET "INFO" = 
 9183                    concat(
 9184                        CASE
 9185                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9186                            THEN ''
 9187                            ELSE concat("INFO", ';')
 9188                        END,
 9189                        CASE 
 9190                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9191                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9192                            THEN concat(
 9193                                    '{findbypipeline_tag}=',
 9194                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9195                                )
 9196                            ELSE ''
 9197                        END
 9198                    )
 9199                FROM dataframe_findbypipeline
 9200                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9201            """
 9202            self.conn.execute(sql_update)
 9203
 9204            # Remove added columns
 9205            for added_column in added_columns:
 9206                self.drop_column(column=added_column)
 9207
 9208            # Delete dataframe
 9209            del dataframe_findbypipeline
 9210            gc.collect()
 9211
 9212    def calculation_genotype_concordance(self) -> None:
 9213        """
 9214        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9215        multi-caller VCF files and updates the variant information in the database.
 9216        """
 9217
 9218        # if FORMAT and samples
 9219        if (
 9220            "FORMAT" in self.get_header_columns_as_list()
 9221            and self.get_header_sample_list()
 9222        ):
 9223
 9224            # genotypeconcordance annotation field
 9225            genotypeconcordance_tag = "genotypeconcordance"
 9226
 9227            # VCF infos tags
 9228            vcf_infos_tags = {
 9229                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9230            }
 9231
 9232            # Prefix
 9233            prefix = self.get_explode_infos_prefix()
 9234
 9235            # Field
 9236            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9237
 9238            # Variants table
 9239            table_variants = self.get_table_variants()
 9240
 9241            # Header
 9242            vcf_reader = self.get_header()
 9243
 9244            # Create variant id
 9245            variant_id_column = self.get_variant_id_column()
 9246            added_columns = [variant_id_column]
 9247
 9248            # variant_id, FORMAT and samples
 9249            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9250                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9251            )
 9252
 9253            # Create dataframe
 9254            dataframe_genotypeconcordance = self.get_query_to_df(
 9255                f""" SELECT {samples_fields} FROM {table_variants} """
 9256            )
 9257
 9258            # Create genotypeconcordance column
 9259            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9260                dataframe_genotypeconcordance.apply(
 9261                    lambda row: genotypeconcordance(
 9262                        row, samples=self.get_header_sample_list()
 9263                    ),
 9264                    axis=1,
 9265                )
 9266            )
 9267
 9268            # Add genotypeconcordance to header
 9269            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9270                genotypeconcordance_tag,
 9271                ".",
 9272                "String",
 9273                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9274                "howard calculation",
 9275                "0",
 9276                self.code_type_map.get("String"),
 9277            )
 9278
 9279            # Update
 9280            sql_update = f"""
 9281                UPDATE variants
 9282                SET "INFO" = 
 9283                    concat(
 9284                        CASE
 9285                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9286                            THEN ''
 9287                            ELSE concat("INFO", ';')
 9288                        END,
 9289                        CASE
 9290                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9291                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9292                            THEN concat(
 9293                                    '{genotypeconcordance_tag}=',
 9294                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9295                                )
 9296                            ELSE ''
 9297                        END
 9298                    )
 9299                FROM dataframe_genotypeconcordance
 9300                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9301            """
 9302            self.conn.execute(sql_update)
 9303
 9304            # Remove added columns
 9305            for added_column in added_columns:
 9306                self.drop_column(column=added_column)
 9307
 9308            # Delete dataframe
 9309            del dataframe_genotypeconcordance
 9310            gc.collect()
 9311
 9312    def calculation_barcode(self, tag: str = "barcode") -> None:
 9313        """
 9314        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9315        updates the INFO field in the file with the calculated barcode values.
 9316
 9317        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9318        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9319        the default tag name is set to "barcode", defaults to barcode
 9320        :type tag: str (optional)
 9321        """
 9322
 9323        # if FORMAT and samples
 9324        if (
 9325            "FORMAT" in self.get_header_columns_as_list()
 9326            and self.get_header_sample_list()
 9327        ):
 9328
 9329            # barcode annotation field
 9330            if not tag:
 9331                tag = "barcode"
 9332
 9333            # VCF infos tags
 9334            vcf_infos_tags = {
 9335                tag: "barcode calculation (VaRank)",
 9336            }
 9337
 9338            # Prefix
 9339            prefix = self.get_explode_infos_prefix()
 9340
 9341            # Field
 9342            barcode_infos = prefix + tag
 9343
 9344            # Variants table
 9345            table_variants = self.get_table_variants()
 9346
 9347            # Header
 9348            vcf_reader = self.get_header()
 9349
 9350            # Create variant id
 9351            variant_id_column = self.get_variant_id_column()
 9352            added_columns = [variant_id_column]
 9353
 9354            # variant_id, FORMAT and samples
 9355            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9356                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9357            )
 9358
 9359            # Create dataframe
 9360            dataframe_barcode = self.get_query_to_df(
 9361                f""" SELECT {samples_fields} FROM {table_variants} """
 9362            )
 9363
 9364            # Create barcode column
 9365            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9366                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9367            )
 9368
 9369            # Add barcode to header
 9370            vcf_reader.infos[tag] = vcf.parser._Info(
 9371                tag,
 9372                ".",
 9373                "String",
 9374                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9375                "howard calculation",
 9376                "0",
 9377                self.code_type_map.get("String"),
 9378            )
 9379
 9380            # Update
 9381            sql_update = f"""
 9382                UPDATE {table_variants}
 9383                SET "INFO" = 
 9384                    concat(
 9385                        CASE
 9386                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9387                            THEN ''
 9388                            ELSE concat("INFO", ';')
 9389                        END,
 9390                        CASE
 9391                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9392                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9393                            THEN concat(
 9394                                    '{tag}=',
 9395                                    dataframe_barcode."{barcode_infos}"
 9396                                )
 9397                            ELSE ''
 9398                        END
 9399                    )
 9400                FROM dataframe_barcode
 9401                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9402            """
 9403            self.conn.execute(sql_update)
 9404
 9405            # Remove added columns
 9406            for added_column in added_columns:
 9407                self.drop_column(column=added_column)
 9408
 9409            # Delete dataframe
 9410            del dataframe_barcode
 9411            gc.collect()
 9412
 9413    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9414        """
 9415        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9416        and updates the INFO field in the file with the calculated barcode values.
 9417
 9418        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9419        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9420        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9421        :type tag: str (optional)
 9422        """
 9423
 9424        # if FORMAT and samples
 9425        if (
 9426            "FORMAT" in self.get_header_columns_as_list()
 9427            and self.get_header_sample_list()
 9428        ):
 9429
 9430            # barcode annotation field
 9431            if not tag:
 9432                tag = "BCF"
 9433
 9434            # VCF infos tags
 9435            vcf_infos_tags = {
 9436                tag: "barcode family calculation",
 9437                f"{tag}S": "barcode family samples",
 9438            }
 9439
 9440            # Param
 9441            param = self.get_param()
 9442            log.debug(f"param={param}")
 9443
 9444            # Prefix
 9445            prefix = self.get_explode_infos_prefix()
 9446
 9447            # PED param
 9448            ped = (
 9449                param.get("calculation", {})
 9450                .get("calculations", {})
 9451                .get("BARCODEFAMILY", {})
 9452                .get("family_pedigree", None)
 9453            )
 9454            log.debug(f"ped={ped}")
 9455
 9456            # Load PED
 9457            if ped:
 9458
 9459                # Pedigree is a file
 9460                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9461                    log.debug("Pedigree is file")
 9462                    with open(full_path(ped)) as ped:
 9463                        ped = yaml.safe_load(ped)
 9464
 9465                # Pedigree is a string
 9466                elif isinstance(ped, str):
 9467                    log.debug("Pedigree is str")
 9468                    try:
 9469                        ped = json.loads(ped)
 9470                        log.debug("Pedigree is json str")
 9471                    except ValueError as e:
 9472                        ped_samples = ped.split(",")
 9473                        ped = {}
 9474                        for ped_sample in ped_samples:
 9475                            ped[ped_sample] = ped_sample
 9476
 9477                # Pedigree is a dict
 9478                elif isinstance(ped, dict):
 9479                    log.debug("Pedigree is dict")
 9480
 9481                # Pedigree is not well formatted
 9482                else:
 9483                    msg_error = "Pedigree not well formatted"
 9484                    log.error(msg_error)
 9485                    raise ValueError(msg_error)
 9486
 9487                # Construct list
 9488                ped_samples = list(ped.values())
 9489
 9490            else:
 9491                log.debug("Pedigree not defined. Take all samples")
 9492                ped_samples = self.get_header_sample_list()
 9493                ped = {}
 9494                for ped_sample in ped_samples:
 9495                    ped[ped_sample] = ped_sample
 9496
 9497            # Check pedigree
 9498            if not ped or len(ped) == 0:
 9499                msg_error = f"Error in pedigree: samples {ped_samples}"
 9500                log.error(msg_error)
 9501                raise ValueError(msg_error)
 9502
 9503            # Log
 9504            log.info(
 9505                "Calculation 'BARCODEFAMILY' - Samples: "
 9506                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9507            )
 9508            log.debug(f"ped_samples={ped_samples}")
 9509
 9510            # Field
 9511            barcode_infos = prefix + tag
 9512
 9513            # Variants table
 9514            table_variants = self.get_table_variants()
 9515
 9516            # Header
 9517            vcf_reader = self.get_header()
 9518
 9519            # Create variant id
 9520            variant_id_column = self.get_variant_id_column()
 9521            added_columns = [variant_id_column]
 9522
 9523            # variant_id, FORMAT and samples
 9524            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9525                [f""" "{sample}" """ for sample in ped_samples]
 9526            )
 9527
 9528            # Create dataframe
 9529            dataframe_barcode = self.get_query_to_df(
 9530                f""" SELECT {samples_fields} FROM {table_variants} """
 9531            )
 9532
 9533            # Create barcode column
 9534            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9535                lambda row: barcode(row, samples=ped_samples), axis=1
 9536            )
 9537
 9538            # Add barcode family to header
 9539            # Add vaf_normalization to header
 9540            vcf_reader.formats[tag] = vcf.parser._Format(
 9541                id=tag,
 9542                num=".",
 9543                type="String",
 9544                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9545                type_code=self.code_type_map.get("String"),
 9546            )
 9547            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9548                id=f"{tag}S",
 9549                num=".",
 9550                type="String",
 9551                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9552                type_code=self.code_type_map.get("String"),
 9553            )
 9554
 9555            # Update
 9556            # for sample in ped_samples:
 9557            sql_update_set = []
 9558            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9559                if sample in ped_samples:
 9560                    value = f'dataframe_barcode."{barcode_infos}"'
 9561                    value_samples = (
 9562                        "'"
 9563                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
 9564                        + "'"
 9565                    )
 9566                    ped_samples
 9567                elif sample == "FORMAT":
 9568                    value = f"'{tag}'"
 9569                    value_samples = f"'{tag}S'"
 9570                else:
 9571                    value = "'.'"
 9572                    value_samples = "'.'"
 9573                format_regex = r"[a-zA-Z0-9\s]"
 9574                sql_update_set.append(
 9575                    f"""
 9576                        "{sample}" = 
 9577                        concat(
 9578                            CASE
 9579                                WHEN {table_variants}."{sample}" = './.'
 9580                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9581                                ELSE {table_variants}."{sample}"
 9582                            END,
 9583                            ':',
 9584                            {value},
 9585                            ':',
 9586                            {value_samples}
 9587                        )
 9588                    """
 9589                )
 9590
 9591            sql_update_set_join = ", ".join(sql_update_set)
 9592            sql_update = f"""
 9593                UPDATE {table_variants}
 9594                SET {sql_update_set_join}
 9595                FROM dataframe_barcode
 9596                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9597            """
 9598            self.conn.execute(sql_update)
 9599
 9600            # Remove added columns
 9601            for added_column in added_columns:
 9602                self.drop_column(column=added_column)
 9603
 9604            # Delete dataframe
 9605            del dataframe_barcode
 9606            gc.collect()
 9607
 9608    def calculation_trio(self) -> None:
 9609        """
 9610        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9611        information to the INFO field of each variant.
 9612        """
 9613
 9614        # if FORMAT and samples
 9615        if (
 9616            "FORMAT" in self.get_header_columns_as_list()
 9617            and self.get_header_sample_list()
 9618        ):
 9619
 9620            # trio annotation field
 9621            trio_tag = "trio"
 9622
 9623            # VCF infos tags
 9624            vcf_infos_tags = {
 9625                "trio": "trio calculation",
 9626            }
 9627
 9628            # Param
 9629            param = self.get_param()
 9630
 9631            # Prefix
 9632            prefix = self.get_explode_infos_prefix()
 9633
 9634            # Trio param
 9635            trio_ped = (
 9636                param.get("calculation", {})
 9637                .get("calculations", {})
 9638                .get("TRIO", {})
 9639                .get("trio_pedigree", None)
 9640            )
 9641
 9642            # Load trio
 9643            if trio_ped:
 9644
 9645                # Trio pedigree is a file
 9646                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9647                    log.debug("TRIO pedigree is file")
 9648                    with open(full_path(trio_ped)) as trio_ped:
 9649                        trio_ped = yaml.safe_load(trio_ped)
 9650
 9651                # Trio pedigree is a string
 9652                elif isinstance(trio_ped, str):
 9653                    log.debug("TRIO pedigree is str")
 9654                    try:
 9655                        trio_ped = json.loads(trio_ped)
 9656                        log.debug("TRIO pedigree is json str")
 9657                    except ValueError as e:
 9658                        trio_samples = trio_ped.split(",")
 9659                        if len(trio_samples) == 3:
 9660                            trio_ped = {
 9661                                "father": trio_samples[0],
 9662                                "mother": trio_samples[1],
 9663                                "child": trio_samples[2],
 9664                            }
 9665                            log.debug("TRIO pedigree is list str")
 9666                        else:
 9667                            msg_error = "TRIO pedigree not well formatted"
 9668                            log.error(msg_error)
 9669                            raise ValueError(msg_error)
 9670
 9671                # Trio pedigree is a dict
 9672                elif isinstance(trio_ped, dict):
 9673                    log.debug("TRIO pedigree is dict")
 9674
 9675                # Trio pedigree is not well formatted
 9676                else:
 9677                    msg_error = "TRIO pedigree not well formatted"
 9678                    log.error(msg_error)
 9679                    raise ValueError(msg_error)
 9680
 9681                # Construct trio list
 9682                trio_samples = [
 9683                    trio_ped.get("father", ""),
 9684                    trio_ped.get("mother", ""),
 9685                    trio_ped.get("child", ""),
 9686                ]
 9687
 9688            else:
 9689                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9690                samples_list = self.get_header_sample_list()
 9691                if len(samples_list) >= 3:
 9692                    trio_samples = self.get_header_sample_list()[0:3]
 9693                    trio_ped = {
 9694                        "father": trio_samples[0],
 9695                        "mother": trio_samples[1],
 9696                        "child": trio_samples[2],
 9697                    }
 9698                else:
 9699                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9700                    log.error(msg_error)
 9701                    raise ValueError(msg_error)
 9702
 9703            # Check trio pedigree
 9704            if not trio_ped or len(trio_ped) != 3:
 9705                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9706                log.error(msg_error)
 9707                raise ValueError(msg_error)
 9708
 9709            # Log
 9710            log.info(
 9711                f"Calculation 'TRIO' - Samples: "
 9712                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9713            )
 9714
 9715            # Field
 9716            trio_infos = prefix + trio_tag
 9717
 9718            # Variants table
 9719            table_variants = self.get_table_variants()
 9720
 9721            # Header
 9722            vcf_reader = self.get_header()
 9723
 9724            # Create variant id
 9725            variant_id_column = self.get_variant_id_column()
 9726            added_columns = [variant_id_column]
 9727
 9728            # variant_id, FORMAT and samples
 9729            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9730                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9731            )
 9732
 9733            # Create dataframe
 9734            dataframe_trio = self.get_query_to_df(
 9735                f""" SELECT {samples_fields} FROM {table_variants} """
 9736            )
 9737
 9738            # Create trio column
 9739            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9740                lambda row: trio(row, samples=trio_samples), axis=1
 9741            )
 9742
 9743            # Add trio to header
 9744            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9745                trio_tag,
 9746                ".",
 9747                "String",
 9748                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9749                "howard calculation",
 9750                "0",
 9751                self.code_type_map.get("String"),
 9752            )
 9753
 9754            # Update
 9755            sql_update = f"""
 9756                UPDATE {table_variants}
 9757                SET "INFO" = 
 9758                    concat(
 9759                        CASE
 9760                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9761                            THEN ''
 9762                            ELSE concat("INFO", ';')
 9763                        END,
 9764                        CASE
 9765                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9766                             AND dataframe_trio."{trio_infos}" NOT NULL
 9767                            THEN concat(
 9768                                    '{trio_tag}=',
 9769                                    dataframe_trio."{trio_infos}"
 9770                                )
 9771                            ELSE ''
 9772                        END
 9773                    )
 9774                FROM dataframe_trio
 9775                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9776            """
 9777            self.conn.execute(sql_update)
 9778
 9779            # Remove added columns
 9780            for added_column in added_columns:
 9781                self.drop_column(column=added_column)
 9782
 9783            # Delete dataframe
 9784            del dataframe_trio
 9785            gc.collect()
 9786
 9787    def calculation_vaf_normalization(self) -> None:
 9788        """
 9789        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9790        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9791        :return: The function does not return anything.
 9792        """
 9793
 9794        # if FORMAT and samples
 9795        if (
 9796            "FORMAT" in self.get_header_columns_as_list()
 9797            and self.get_header_sample_list()
 9798        ):
 9799
 9800            # vaf_normalization annotation field
 9801            vaf_normalization_tag = "VAF"
 9802
 9803            # VCF infos tags
 9804            vcf_infos_tags = {
 9805                "VAF": "VAF Variant Frequency",
 9806            }
 9807
 9808            # Prefix
 9809            prefix = self.get_explode_infos_prefix()
 9810
 9811            # Variants table
 9812            table_variants = self.get_table_variants()
 9813
 9814            # Header
 9815            vcf_reader = self.get_header()
 9816
 9817            # Do not calculate if VAF already exists
 9818            if "VAF" in vcf_reader.formats:
 9819                log.debug("VAF already on genotypes")
 9820                return
 9821
 9822            # Create variant id
 9823            variant_id_column = self.get_variant_id_column()
 9824            added_columns = [variant_id_column]
 9825
 9826            # variant_id, FORMAT and samples
 9827            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9828                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9829            )
 9830
 9831            # Create dataframe
 9832            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9833            log.debug(f"query={query}")
 9834            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9835
 9836            vaf_normalization_set = []
 9837
 9838            # for each sample vaf_normalization
 9839            for sample in self.get_header_sample_list():
 9840                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9841                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9842                )
 9843                vaf_normalization_set.append(
 9844                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9845                )
 9846
 9847            # Add VAF to FORMAT
 9848            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9849                "FORMAT"
 9850            ].apply(lambda x: str(x) + ":VAF")
 9851            vaf_normalization_set.append(
 9852                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9853            )
 9854
 9855            # Add vaf_normalization to header
 9856            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9857                id=vaf_normalization_tag,
 9858                num="1",
 9859                type="Float",
 9860                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9861                type_code=self.code_type_map.get("Float"),
 9862            )
 9863
 9864            # Create fields to add in INFO
 9865            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9866
 9867            # Update
 9868            sql_update = f"""
 9869                UPDATE {table_variants}
 9870                SET {sql_vaf_normalization_set}
 9871                FROM dataframe_vaf_normalization
 9872                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9873
 9874            """
 9875            self.conn.execute(sql_update)
 9876
 9877            # Remove added columns
 9878            for added_column in added_columns:
 9879                self.drop_column(column=added_column)
 9880
 9881            # Delete dataframe
 9882            del dataframe_vaf_normalization
 9883            gc.collect()
 9884
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        For each variant, the genotype values of `info` across all samples are aggregated
        (count, list, min, max, mean, median, standard deviation) and each statistic is
        appended to the variants table INFO column as `<info>_stats_<stat>=<value>`.
        The corresponding INFO tags are also declared in the VCF header.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotype stats only make sense when the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag name for the aggregated stats column (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # VCF INFO tags to create, mapped to their header descriptions
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns (project convention)
            prefix = self.get_explode_infos_prefix()

            # Column name holding the per-variant stats dict in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (mutated below to declare the new INFO tags)
            vcf_reader = self.get_header()

            # Variant id column is added to join the dataframe back onto the variants table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Select variant_id, FORMAT and every sample column (sample names quoted)
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Materialize the genotype columns as a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute per-row stats dict via the external genotype_stats helper
            # (presumably returns a dict keyed by the tags above — defined in commons)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per stat tag, concatenated into the INFO update
            sql_vaf_stats_fields = []

            # For each stat: extract it into its own column and declare it in the header
            for stat in vcf_infos_tags:

                # Extract this stat from the per-row stats dict (empty string when absent)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stat tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no separator; subsequent ones are ';'-prefixed
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Emit '<sep><stat>=<value>' when the value is present, '' otherwise
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Concatenate all stat fragments for the INFO update expression
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the stats to INFO, joining the dataframe on the variant id column
            # (DuckDB resolves 'dataframe_vaf_stats' as the registered pandas dataframe)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column added for the join
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory eagerly
            del dataframe_vaf_stats
            gc.collect()
10022
10023    def calculation_transcripts_annotation(
10024        self, info_json: str = None, info_format: str = None
10025    ) -> None:
10026        """
10027        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10028        field to it if transcripts are available.
10029
10030        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10031        is a string parameter that represents the information field to be used in the transcripts JSON.
10032        It is used to specify the JSON format for the transcripts information. If no value is provided
10033        when calling the method, it defaults to "
10034        :type info_json: str
10035        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10036        method is a string parameter that specifies the format of the information field to be used in
10037        the transcripts JSON. It is used to define the format of the information field
10038        :type info_format: str
10039        """
10040
10041        # Create transcripts table
10042        transcripts_table = self.create_transcript_view()
10043
10044        # Add info field
10045        if transcripts_table:
10046            self.transcript_view_to_variants(
10047                transcripts_table=transcripts_table,
10048                transcripts_info_field_json=info_json,
10049                transcripts_info_field_format=info_format,
10050            )
10051        else:
10052            log.info("No Transcripts to process. Check param.json file configuration")
10053
10054    def calculation_transcripts_prioritization(self) -> None:
10055        """
10056        The function `calculation_transcripts_prioritization` creates a transcripts table and
10057        prioritizes transcripts based on certain criteria.
10058        """
10059
10060        # Create transcripts table
10061        transcripts_table = self.create_transcript_view()
10062
10063        # Add info field
10064        if transcripts_table:
10065            self.transcripts_prioritization(transcripts_table=transcripts_table)
10066        else:
10067            log.info("No Transcripts to process. Check param.json file configuration")
10068
10069    def calculation_transcripts_export(self) -> None:
10070        """ """
10071
10072        # Create transcripts table
10073        transcripts_table = self.create_transcript_view()
10074
10075        # Add info field
10076        if transcripts_table:
10077            self.transcripts_export(transcripts_table=transcripts_table)
10078        else:
10079            log.info("No Transcripts to process. Check param.json file configuration")
10080
10081    ###############
10082    # Transcripts #
10083    ###############
10084
10085    def transcripts_export(
10086        self, transcripts_table: str = None, param: dict = {}
10087    ) -> bool:
10088        """ """
10089
10090        log.debug("Start transcripts export...")
10091
10092        # Param
10093        if not param:
10094            param = self.get_param()
10095
10096        # Param export
10097        param_transcript_export = param.get("transcripts", {}).get("export", {})
10098
10099        # Output file
10100        transcripts_export_output = param_transcript_export.get("output", None)
10101
10102        if not param_transcript_export or not transcripts_export_output:
10103            log.warning(f"No transcriipts export parameters defined!")
10104            return False
10105
10106        # List of transcripts annotations
10107        query_describe = f"""
10108            SELECT column_name
10109            FROM (
10110                    DESCRIBE SELECT * FROM {transcripts_table}
10111                )
10112            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10113        """
10114        transcripts_annotations_list = list(
10115            self.get_query_to_df(query=query_describe)["column_name"]
10116        )
10117
10118        # Create transcripts table for export
10119        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10120            random.choices(string.ascii_uppercase + string.digits, k=10)
10121        )
10122        query_create_transcripts_table_export = f"""
10123            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10124        """
10125        self.execute_query(query=query_create_transcripts_table_export)
10126
10127        # Output file format
10128        transcripts_export_output_format = get_file_format(
10129            filename=transcripts_export_output
10130        )
10131
10132        # Format VCF - construct INFO
10133        if transcripts_export_output_format in ["vcf"]:
10134
10135            # Construct query update INFO and header
10136            query_update_info = []
10137            for field in transcripts_annotations_list:
10138
10139                # If field not in header
10140                if field not in self.get_header_infos_list():
10141
10142                    # Add PZ Transcript in header
10143                    self.get_header().infos[field] = vcf.parser._Info(
10144                        field,
10145                        ".",
10146                        "String",
10147                        f"Annotation '{field}' from transcript view",
10148                        "unknown",
10149                        "unknown",
10150                        0,
10151                    )
10152
10153                # Add field as INFO/tag
10154                query_update_info.append(
10155                    f"""
10156                        CASE
10157                            WHEN "{field}" IS NOT NULL
10158                            THEN concat('{field}=', "{field}", ';')    
10159                            ELSE ''     
10160                        END
10161                        """
10162                )
10163
10164            # Query param
10165            query_update_info_value = (
10166                f""" concat('',  {", ".join(query_update_info)}) """
10167            )
10168            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10169
10170        else:
10171
10172            # Query param
10173            query_update_info_value = f""" NULL """
10174            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10175
10176        # Update query INFO column
10177        query_update = f"""
10178            UPDATE {transcripts_table_export}
10179            SET INFO = {query_update_info_value}
10180
10181        """
10182        self.execute_query(query=query_update)
10183
10184        # Export
10185        self.export_output(
10186            output_file=transcripts_export_output,
10187            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10188        )
10189
10190        # Drop transcripts export table
10191        query_drop_transcripts_table_export = f"""
10192            DROP TABLE {transcripts_table_export}
10193        """
10194        self.execute_query(query=query_drop_transcripts_table_export)
10195
10196    def transcripts_prioritization(
10197        self, transcripts_table: str = None, param: dict = {}
10198    ) -> bool:
10199        """
10200        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10201        and updates the variants table with the prioritized information.
10202
10203        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10204        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10205        This parameter is used to identify the table where the transcripts data is stored for the
10206        prioritization process
10207        :type transcripts_table: str
10208        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10209        that contains various configuration settings for the prioritization process of transcripts. It
10210        is used to customize the behavior of the prioritization algorithm and includes settings such as
10211        the prefix for prioritization fields, default profiles, and other
10212        :type param: dict
10213        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10214        transcripts prioritization process is successfully completed, and `False` if there are any
10215        issues or if no profile is defined for transcripts prioritization.
10216        """
10217
10218        log.debug("Start transcripts prioritization...")
10219
10220        # Param
10221        if not param:
10222            param = self.get_param()
10223
10224        # Variants table
10225        table_variants = self.get_table_variants()
10226
10227        # Transcripts table
10228        if transcripts_table is None:
10229            transcripts_table = self.create_transcript_view(
10230                transcripts_table="transcripts", param=param
10231            )
10232        if transcripts_table is None:
10233            msg_err = "No Transcripts table availalble"
10234            log.error(msg_err)
10235            raise ValueError(msg_err)
10236        log.debug(f"transcripts_table={transcripts_table}")
10237
10238        # Get transcripts columns
10239        columns_as_list_query = f"""
10240            DESCRIBE {transcripts_table}
10241        """
10242        columns_as_list = list(
10243            self.get_query_to_df(columns_as_list_query)["column_name"]
10244        )
10245
10246        # Create INFO if not exists
10247        if "INFO" not in columns_as_list:
10248            query_add_info = f"""
10249                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10250            """
10251            self.execute_query(query_add_info)
10252
10253        # Prioritization param and Force only PZ Score and Flag
10254        pz_param = param.get("transcripts", {}).get("prioritization", {})
10255
10256        # PZ profile by default
10257        pz_profile_default = (
10258            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10259        )
10260
10261        # Exit if no profile
10262        if pz_profile_default is None:
10263            log.warning("No profile defined for transcripts prioritization")
10264            return False
10265
10266        # PZ fields
10267        pz_param_pzfields = {}
10268
10269        # PZ field transcripts
10270        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10271
10272        # Add PZ Transcript in header
10273        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10274            pz_fields_transcripts,
10275            ".",
10276            "String",
10277            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10278            "unknown",
10279            "unknown",
10280            code_type_map["String"],
10281        )
10282
10283        # Mandatory fields
10284        pz_mandatory_fields_list = [
10285            "Score",
10286            "Flag",
10287            "Tags",
10288            "Comment",
10289            "Infos",
10290            "Class",
10291        ]
10292        pz_mandatory_fields = []
10293        for pz_mandatory_field in pz_mandatory_fields_list:
10294            pz_mandatory_fields.append(
10295                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10296            )
10297
10298        # PZ fields in param
10299        for pz_field in pz_param.get("pzfields", []):
10300            if pz_field in pz_mandatory_fields_list:
10301                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10302                    pz_param.get("pzprefix", "PTZ") + pz_field
10303                )
10304            else:
10305                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10306                pz_param_pzfields[pz_field] = pz_field_new
10307
10308                # Add PZ Transcript in header
10309                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10310                    pz_field_new,
10311                    ".",
10312                    "String",
10313                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10314                    "unknown",
10315                    "unknown",
10316                    code_type_map["String"],
10317                )
10318
10319        # PZ fields param
10320        pz_param["pzfields"] = pz_mandatory_fields
10321
10322        # Prioritization
10323        prioritization_result = self.prioritization(
10324            table=transcripts_table,
10325            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10326        )
10327        if not prioritization_result:
10328            log.warning("Transcripts prioritization not processed")
10329            return False
10330
10331        # PZ fields sql query
10332        query_update_select_list = []
10333        query_update_concat_list = []
10334        query_update_order_list = []
10335        for pz_param_pzfield in set(
10336            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10337        ):
10338            query_update_select_list.append(f" {pz_param_pzfield}, ")
10339
10340        for pz_param_pzfield in pz_param_pzfields:
10341            query_update_concat_list.append(
10342                f"""
10343                    , CASE 
10344                        WHEN {pz_param_pzfield} IS NOT NULL
10345                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10346                        ELSE ''
10347                    END
10348                """
10349            )
10350
10351        # Order by
10352        pz_orders = (
10353            param.get("transcripts", {})
10354            .get("prioritization", {})
10355            .get("prioritization_transcripts_order", {})
10356        )
10357        if not pz_orders:
10358            pz_orders = {
10359                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10360                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10361            }
10362        for pz_order in pz_orders:
10363            query_update_order_list.append(
10364                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10365            )
10366
10367        # Fields to explode
10368        fields_to_explode = (
10369            list(pz_param_pzfields.keys())
10370            + pz_mandatory_fields
10371            + list(pz_orders.keys())
10372        )
10373        # Remove transcript column as a specific transcript column
10374        if "transcript" in fields_to_explode:
10375            fields_to_explode.remove("transcript")
10376
10377        # Fields intranscripts table
10378        query_transcripts_table = f"""
10379            DESCRIBE SELECT * FROM {transcripts_table}
10380        """
10381        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10382
10383        # Check fields to explode
10384        for field_to_explode in fields_to_explode:
10385            if field_to_explode not in self.get_header_infos_list() + list(
10386                query_transcripts_table.column_name
10387            ):
10388                msg_err = f"INFO/{field_to_explode} NOT IN header"
10389                log.error(msg_err)
10390                raise ValueError(msg_err)
10391
10392        # Explode fields to explode
10393        self.explode_infos(
10394            table=transcripts_table,
10395            fields=fields_to_explode,
10396        )
10397
10398        # Transcript preference file
10399        transcripts_preference_file = (
10400            param.get("transcripts", {})
10401            .get("prioritization", {})
10402            .get("prioritization_transcripts", {})
10403        )
10404        transcripts_preference_file = full_path(transcripts_preference_file)
10405
10406        # Transcript preference forced
10407        transcript_preference_force = (
10408            param.get("transcripts", {})
10409            .get("prioritization", {})
10410            .get("prioritization_transcripts_force", False)
10411        )
10412        # Transcript version forced
10413        transcript_version_force = (
10414            param.get("transcripts", {})
10415            .get("prioritization", {})
10416            .get("prioritization_transcripts_version_force", False)
10417        )
10418
10419        # Transcripts Ranking
10420        if transcripts_preference_file:
10421
10422            # Transcripts file to dataframe
10423            if os.path.exists(transcripts_preference_file):
10424                transcripts_preference_dataframe = transcripts_file_to_df(
10425                    transcripts_preference_file
10426                )
10427            else:
10428                log.error(
10429                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10430                )
10431                raise ValueError(
10432                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10433                )
10434
10435            # Order by depending to transcript preference forcing
10436            if transcript_preference_force:
10437                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10438            else:
10439                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10440
10441            # Transcript columns joined depend on version consideration
10442            if transcript_version_force:
10443                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10444            else:
10445                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10446
10447            # Query ranking for update
10448            query_update_ranking = f"""
10449                SELECT
10450                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10451                    ROW_NUMBER() OVER (
10452                        PARTITION BY "#CHROM", POS, REF, ALT
10453                        ORDER BY {order_by}
10454                    ) AS rn
10455                FROM {transcripts_table}
10456                LEFT JOIN 
10457                    (
10458                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10459                        FROM transcripts_preference_dataframe
10460                    ) AS transcripts_preference
10461                ON {transcripts_version_join}
10462            """
10463
10464        else:
10465
10466            # Query ranking for update
10467            query_update_ranking = f"""
10468                SELECT
10469                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10470                    ROW_NUMBER() OVER (
10471                        PARTITION BY "#CHROM", POS, REF, ALT
10472                        ORDER BY {" , ".join(query_update_order_list)}
10473                    ) AS rn
10474                FROM {transcripts_table}
10475            """
10476
10477        # Export Transcripts prioritization infos to variants table
10478        query_update = f"""
10479            WITH RankedTranscripts AS (
10480                {query_update_ranking}
10481            )
10482            UPDATE {table_variants}
10483                SET
10484                INFO = CONCAT(CASE
10485                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10486                            THEN ''
10487                            ELSE concat("INFO", ';')
10488                        END,
10489                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10490                        )
10491            FROM
10492                RankedTranscripts
10493            WHERE
10494                rn = 1
10495                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10496                AND variants."POS" = RankedTranscripts."POS"
10497                AND variants."REF" = RankedTranscripts."REF"
10498                AND variants."ALT" = RankedTranscripts."ALT"     
10499        """
10500
10501        # log.debug(f"query_update={query_update}")
10502        self.execute_query(query=query_update)
10503
10504        # Return
10505        return True
10506
10507    def create_transcript_view_from_columns_map(
10508        self,
10509        transcripts_table: str = "transcripts",
10510        columns_maps: dict = {},
10511        added_columns: list = [],
10512        temporary_tables: list = None,
10513        annotation_fields: list = None,
10514        column_rename: dict = {},
10515        column_clean: bool = False,
10516        column_case: str = None,
10517    ) -> tuple[list, list, list]:
10518        """
10519        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10520        specified columns mapping for transcripts data.
10521
10522        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10523        of the table where the transcripts data is stored or will be stored in the database. This table
10524        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10525        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10526        :type transcripts_table: str (optional)
10527        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10528        about how to map columns from a transcripts table to create a view. Each entry in the
10529        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10530        typically includes details such as the main transcript column and additional information columns
10531        :type columns_maps: dict
10532        :param added_columns: The `added_columns` parameter in the
10533        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10534        that will be added to the view being created based on the columns map provided. These columns
10535        are generated by exploding the transcript information columns along with the main transcript
10536        column
10537        :type added_columns: list
10538        :param temporary_tables: The `temporary_tables` parameter in the
10539        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10540        tables created during the process of creating a transcript view from a columns map. These
10541        temporary tables are used to store intermediate results or transformations before the final view
10542        is generated
10543        :type temporary_tables: list
10544        :param annotation_fields: The `annotation_fields` parameter in the
10545        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10546        used for annotation in the query view creation process. These fields are extracted from the
10547        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10548        :type annotation_fields: list
10549        :param column_rename: The `column_rename` parameter in the
10550        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10551        custom renaming for columns during the creation of the temporary table view. This parameter
10552        provides a mapping of original column names to the desired renamed column names. By using this
10553        parameter,
10554        :type column_rename: dict
10555        :param column_clean: The `column_clean` parameter in the
10556        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10557        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10558        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10559        False
10560        :type column_clean: bool (optional)
10561        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10562        function is used to specify the case transformation to be applied to the columns during the view
10563        creation process. It allows you to control whether the column values should be converted to
10564        lowercase, uppercase, or remain unchanged
10565        :type column_case: str
10566        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10567        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10568        """
10569
10570        log.debug("Start transcrpts view creation from columns map...")
10571
10572        # "from_columns_map": [
10573        #     {
10574        #         "transcripts_column": "Ensembl_transcriptid",
10575        #         "transcripts_infos_columns": [
10576        #             "genename",
10577        #             "Ensembl_geneid",
10578        #             "LIST_S2_score",
10579        #             "LIST_S2_pred",
10580        #         ],
10581        #     },
10582        #     {
10583        #         "transcripts_column": "Ensembl_transcriptid",
10584        #         "transcripts_infos_columns": [
10585        #             "genename",
10586        #             "VARITY_R_score",
10587        #             "Aloft_pred",
10588        #         ],
10589        #     },
10590        # ],
10591
10592        # Init
10593        if temporary_tables is None:
10594            temporary_tables = []
10595        if annotation_fields is None:
10596            annotation_fields = []
10597
10598        # Variants table
10599        table_variants = self.get_table_variants()
10600
10601        for columns_map in columns_maps:
10602
10603            # Log
10604            log.debug(f"columns_map={columns_map}")
10605
10606            # Transcript column
10607            transcripts_column = columns_map.get("transcripts_column", None)
10608
10609            # Transcripts infos columns
10610            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10611
10612            # Transcripts infos columns rename
10613            column_rename = columns_map.get("column_rename", column_rename)
10614
10615            # Transcripts infos columns clean
10616            column_clean = columns_map.get("column_clean", column_clean)
10617
10618            # Transcripts infos columns case
10619            column_case = columns_map.get("column_case", column_case)
10620
10621            if transcripts_column is not None:
10622
10623                # Explode
10624                added_columns += self.explode_infos(
10625                    fields=[transcripts_column] + transcripts_infos_columns
10626                )
10627
10628                # View clauses
10629                clause_select_variants = []
10630                clause_select_tanscripts = []
10631                for field in [transcripts_column] + transcripts_infos_columns:
10632
10633                    # AS field
10634                    as_field = field
10635
10636                    # Rename
10637                    if column_rename:
10638                        as_field = column_rename.get(as_field, as_field)
10639
10640                    # Clean
10641                    if column_clean:
10642                        as_field = clean_annotation_field(as_field)
10643
10644                    # Case
10645                    if column_case:
10646                        if column_case.lower() in ["lower"]:
10647                            as_field = as_field.lower()
10648                        elif column_case.lower() in ["upper"]:
10649                            as_field = as_field.upper()
10650
10651                    # Clause select Variants
10652                    clause_select_variants.append(
10653                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10654                    )
10655
10656                    if field in [transcripts_column]:
10657                        clause_select_tanscripts.append(
10658                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10659                        )
10660                    else:
10661                        clause_select_tanscripts.append(
10662                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10663                        )
10664                        annotation_fields.append(as_field)
10665
10666                # Query View
10667                query = f""" 
10668                    SELECT
10669                        "#CHROM", POS, REF, ALT, INFO,
10670                        "{transcripts_column}" AS 'transcript',
10671                        {", ".join(clause_select_tanscripts)}
10672                    FROM (
10673                        SELECT 
10674                            "#CHROM", POS, REF, ALT, INFO,
10675                            {", ".join(clause_select_variants)}
10676                        FROM {table_variants}
10677                        )
10678                    WHERE "{transcripts_column}" IS NOT NULL
10679                """
10680
10681                # Create temporary table
10682                temporary_table = transcripts_table + "".join(
10683                    random.choices(string.ascii_uppercase + string.digits, k=10)
10684                )
10685
10686                # # Temporary_tables
10687                # temporary_tables.append(temporary_table)
10688                # query_view = f"""
10689                #     CREATE TEMPORARY TABLE {temporary_table}
10690                #     AS ({query})
10691                # """
10692                # self.execute_query(query=query_view)
10693
10694                # Temporary_tables
10695                temporary_tables.append(temporary_table)
10696
10697                # List of unique #CHROM
10698                query_unique_chrom = f"""
10699                    SELECT DISTINCT "#CHROM"
10700                    FROM variants
10701                """
10702                unique_chroms = self.get_query_to_df(query=query_unique_chrom)
10703
10704                # Create table with structure but without data
10705                query_create_table = f"""
10706                    CREATE TABLE {temporary_table}
10707                    AS ({query} LIMIT 0)
10708                """
10709                self.execute_query(query=query_create_table)
10710
10711                # Process by #CHROM
10712                for chrom in unique_chroms["#CHROM"]:
10713
10714                    # Log
10715                    log.debug(f"Processing #CHROM={chrom}")
10716
10717                    # Select data by #CHROM
10718                    query_chunk = f"""
10719                        SELECT *
10720                        FROM ({query})
10721                        WHERE "#CHROM" = '{chrom}'
10722                    """
10723
10724                    # Insert data
10725                    query_insert_chunk = f"""
10726                        INSERT INTO {temporary_table}
10727                        {query_chunk}
10728                    """
10729                    self.execute_query(query=query_insert_chunk)
10730
10731        return added_columns, temporary_tables, annotation_fields
10732
10733    def create_transcript_view_from_column_format(
10734        self,
10735        transcripts_table: str = "transcripts",
10736        column_formats: dict = {},
10737        temporary_tables: list = None,
10738        annotation_fields: list = None,
10739        column_rename: dict = {},
10740        column_clean: bool = False,
10741        column_case: str = None,
10742    ) -> tuple[list, list, list]:
10743        """
10744        The `create_transcript_view_from_column_format` function generates a transcript view based on
10745        specified column formats, adds additional columns and annotation fields, and returns the list of
10746        temporary tables and annotation fields.
10747
10748        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10749        of the table containing the transcripts data. This table will be used as the base table for
10750        creating the transcript view. The default value for this parameter is "transcripts", but you can
10751        provide a different table name if needed, defaults to transcripts
10752        :type transcripts_table: str (optional)
10753        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10754        about the columns to be used for creating the transcript view. Each entry in the dictionary
10755        specifies the mapping between a transcripts column and a transcripts infos column. This
10756        parameter allows you to define how the columns from the transcripts table should be transformed
10757        or mapped
10758        :type column_formats: dict
10759        :param temporary_tables: The `temporary_tables` parameter in the
10760        `create_transcript_view_from_column_format` function is a list that stores the names of
10761        temporary views created during the process of creating a transcript view from a column format.
10762        These temporary views are used to manipulate and extract data before generating the final
10763        transcript view
10764        :type temporary_tables: list
10765        :param annotation_fields: The `annotation_fields` parameter in the
10766        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10767        that are extracted from the temporary views created during the process. These annotation fields
10768        are obtained by querying the temporary views and extracting the column names excluding specific
10769        columns like `#CH
10770        :type annotation_fields: list
10771        :param column_rename: The `column_rename` parameter in the
10772        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10773        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10774        column names to new column names in this dictionary, you can rename specific columns during the
10775        process
10776        :type column_rename: dict
10777        :param column_clean: The `column_clean` parameter in the
10778        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10779        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10780        will be cleaned during the creation of the transcript view based on the specified column format,
10781        defaults to False
10782        :type column_clean: bool (optional)
10783        :param column_case: The `column_case` parameter in the
10784        `create_transcript_view_from_column_format` function is used to specify the case transformation
10785        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10786        to convert the column names to uppercase or lowercase, respectively
10787        :type column_case: str
10788        :return: The `create_transcript_view_from_column_format` function returns two lists:
10789        `temporary_tables` and `annotation_fields`.
10790        """
10791
10792        log.debug("Start transcrpts view creation from column format...")
10793
10794        #  "from_column_format": [
10795        #     {
10796        #         "transcripts_column": "ANN",
10797        #         "transcripts_infos_column": "Feature_ID",
10798        #     }
10799        # ],
10800
10801        # Init
10802        if temporary_tables is None:
10803            temporary_tables = []
10804        if annotation_fields is None:
10805            annotation_fields = []
10806
10807        for column_format in column_formats:
10808
10809            # annotation field and transcript annotation field
10810            annotation_field = column_format.get("transcripts_column", "ANN")
10811            transcript_annotation = column_format.get(
10812                "transcripts_infos_column", "Feature_ID"
10813            )
10814
10815            # Transcripts infos columns rename
10816            column_rename = column_format.get("column_rename", column_rename)
10817
10818            # Transcripts infos columns clean
10819            column_clean = column_format.get("column_clean", column_clean)
10820
10821            # Transcripts infos columns case
10822            column_case = column_format.get("column_case", column_case)
10823
10824            # Temporary View name
10825            temporary_view_name = transcripts_table + "".join(
10826                random.choices(string.ascii_uppercase + string.digits, k=10)
10827            )
10828
10829            # Create temporary view name
10830            temporary_view_name = self.annotation_format_to_table(
10831                uniquify=True,
10832                annotation_field=annotation_field,
10833                view_name=temporary_view_name,
10834                annotation_id=transcript_annotation,
10835                column_rename=column_rename,
10836                column_clean=column_clean,
10837                column_case=column_case,
10838            )
10839
10840            # Annotation fields
10841            if temporary_view_name:
10842                query_annotation_fields = f"""
10843                    SELECT *
10844                    FROM (
10845                        DESCRIBE SELECT *
10846                        FROM {temporary_view_name}
10847                        )
10848                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10849                """
10850                df_annotation_fields = self.get_query_to_df(
10851                    query=query_annotation_fields
10852                )
10853
10854                # Add temporary view and annotation fields
10855                temporary_tables.append(temporary_view_name)
10856                annotation_fields += list(set(df_annotation_fields["column_name"]))
10857
10858        return temporary_tables, annotation_fields
10859
10860    def create_transcript_view(
10861        self,
10862        transcripts_table: str = None,
10863        transcripts_table_drop: bool = False,
10864        param: dict = {},
10865    ) -> str:
10866        """
10867        The `create_transcript_view` function generates a transcript view by processing data from a
10868        specified table based on provided parameters and structural information.
10869
10870        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10871        is used to specify the name of the table that will store the final transcript view data. If a table
10872        name is not provided, the function will create a new table to store the transcript view data, and by
10873        default,, defaults to transcripts
10874        :type transcripts_table: str (optional)
10875        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10876        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10877        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10878        the function will drop the existing transcripts table if it exists, defaults to False
10879        :type transcripts_table_drop: bool (optional)
10880        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10881        contains information needed to create a transcript view. It includes details such as the structure
10882        of the transcripts, columns mapping, column formats, and other necessary information for generating
10883        the view. This parameter allows for flexibility and customization
10884        :type param: dict
10885        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10886        created or modified during the execution of the function.
10887        """
10888
10889        log.debug("Start transcripts view creation...")
10890
10891        # Default
10892        transcripts_table_default = "transcripts"
10893
10894        # Param
10895        if not param:
10896            param = self.get_param()
10897
10898        # Struct
10899        struct = param.get("transcripts", {}).get("struct", None)
10900
10901        # Transcript veresion
10902        transcript_id_remove_version = param.get("transcripts", {}).get(
10903            "transcript_id_remove_version", False
10904        )
10905
10906        # Transcripts mapping
10907        transcript_id_mapping_file = param.get("transcripts", {}).get(
10908            "transcript_id_mapping_file", None
10909        )
10910
10911        # Transcripts mapping
10912        transcript_id_mapping_force = param.get("transcripts", {}).get(
10913            "transcript_id_mapping_force", None
10914        )
10915
10916        # Transcripts table
10917        if transcripts_table is None:
10918            transcripts_table = param.get("transcripts", {}).get(
10919                "table", transcripts_table_default
10920            )
10921
10922        # Check transcripts table exists
10923        if transcripts_table:
10924
10925            # Query to check if transcripts table exists
10926            query_check_table = f"""
10927                SELECT * 
10928                FROM information_schema.tables 
10929                WHERE table_name = '{transcripts_table}'
10930            """
10931            df_check_table = self.get_query_to_df(query=query_check_table)
10932
10933            # Check if transcripts table exists
10934            if len(df_check_table) > 0 and not transcripts_table_drop:
10935                log.debug(f"Table {transcripts_table} exists and not drop option")
10936                return transcripts_table
10937
10938        if struct:
10939
10940            # added_columns
10941            added_columns = []
10942
10943            # Temporary tables
10944            temporary_tables = []
10945
10946            # Annotation fields
10947            annotation_fields = []
10948
10949            # from columns map
10950            columns_maps = struct.get("from_columns_map", [])
10951            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10952                self.create_transcript_view_from_columns_map(
10953                    transcripts_table=transcripts_table,
10954                    columns_maps=columns_maps,
10955                    added_columns=added_columns,
10956                    temporary_tables=temporary_tables,
10957                    annotation_fields=annotation_fields,
10958                )
10959            )
10960            added_columns += added_columns_tmp
10961            temporary_tables += temporary_tables_tmp
10962            annotation_fields += annotation_fields_tmp
10963
10964            # from column format
10965            column_formats = struct.get("from_column_format", [])
10966            temporary_tables_tmp, annotation_fields_tmp = (
10967                self.create_transcript_view_from_column_format(
10968                    transcripts_table=transcripts_table,
10969                    column_formats=column_formats,
10970                    temporary_tables=temporary_tables,
10971                    annotation_fields=annotation_fields,
10972                )
10973            )
10974            temporary_tables += temporary_tables_tmp
10975            annotation_fields += annotation_fields_tmp
10976
10977            # Remove some specific fields/column
10978            annotation_fields = list(set(annotation_fields))
10979            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10980                if field in annotation_fields:
10981                    annotation_fields.remove(field)
10982
10983            # Merge temporary tables query
10984            query_merge = ""
10985            for temporary_table in list(set(temporary_tables)):
10986
10987                # First temporary table
10988                if not query_merge:
10989                    query_merge = f"""
10990                        SELECT * FROM {temporary_table}
10991                    """
10992                # other temporary table (using UNION)
10993                else:
10994                    query_merge += f"""
10995                        UNION BY NAME SELECT * FROM {temporary_table}
10996                    """
10997
10998            # transcript table tmp
10999            transcript_table_tmp = "transcripts_tmp"
11000            transcript_table_tmp2 = "transcripts_tmp2"
11001            transcript_table_tmp3 = "transcripts_tmp3"
11002
11003            # Merge on transcript
11004            query_merge_on_transcripts_annotation_fields = []
11005
11006            # Add transcript list
11007            query_merge_on_transcripts_annotation_fields.append(
11008                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
11009            )
11010
11011            # Aggregate all annotations fields
11012            for annotation_field in set(annotation_fields):
11013                query_merge_on_transcripts_annotation_fields.append(
11014                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
11015                )
11016
11017            # Transcripts mapping
11018            if transcript_id_mapping_file:
11019
11020                # Transcript dataframe
11021                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
11022                transcript_id_mapping_dataframe = transcripts_file_to_df(
11023                    transcript_id_mapping_file, column_names=["transcript", "alias"]
11024                )
11025
11026                # Transcript version remove
11027                if transcript_id_remove_version:
11028                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
11029                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
11030                    query_left_join = f"""
11031                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11032                    """
11033                else:
11034                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
11035                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
11036                    query_left_join = f"""
11037                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11038                    """
11039
11040                # Transcript column for group by merge
11041                query_transcript_merge_group_by = """
11042                        CASE
11043                            WHEN transcript_mapped NOT IN ('')
11044                            THEN split_part(transcript_mapped, '.', 1)
11045                            ELSE split_part(transcript_original, '.', 1)
11046                        END
11047                    """
11048
11049                # Merge query
11050                transcripts_tmp2_query = f"""
11051                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
11052                    FROM ({query_merge}) AS {transcript_table_tmp}
11053                    {query_left_join}
11054                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
11055                """
11056
11057                # Retrive columns after mege
11058                transcripts_tmp2_describe_query = f"""
11059                    DESCRIBE {transcripts_tmp2_query}
11060                """
11061                transcripts_tmp2_describe_list = list(
11062                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
11063                        "column_name"
11064                    ]
11065                )
11066
11067                # Create list of columns for select clause
11068                transcripts_tmp2_describe_select_clause = []
11069                for field in transcripts_tmp2_describe_list:
11070                    if field not in [
11071                        "#CHROM",
11072                        "POS",
11073                        "REF",
11074                        "ALT",
11075                        "INFO",
11076                        "transcript_mapped",
11077                    ]:
11078                        as_field = field
11079                        if field in ["transcript_original"]:
11080                            as_field = "transcripts_mapped"
11081                        transcripts_tmp2_describe_select_clause.append(
11082                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11083                        )
11084
11085                # Merge with mapping
11086                query_merge_on_transcripts = f"""
11087                    SELECT
11088                        "#CHROM", POS, REF, ALT, INFO,
11089                        CASE
11090                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11091                            THEN ANY_VALUE(transcript_mapped)
11092                            ELSE ANY_VALUE(transcript_original)
11093                        END AS transcript,
11094                        {", ".join(transcripts_tmp2_describe_select_clause)}
11095                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11096                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11097                        {query_transcript_merge_group_by}
11098                """
11099
11100                # Add transcript filter from mapping file
11101                if transcript_id_mapping_force:
11102                    query_merge_on_transcripts = f"""
11103                        SELECT *
11104                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11105                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11106                    """
11107
11108            # No transcript mapping
11109            else:
11110
11111                # Remove transcript version
11112                if transcript_id_remove_version:
11113                    query_transcript_column = f"""
11114                        split_part({transcript_table_tmp}.transcript, '.', 1)
11115                    """
11116                else:
11117                    query_transcript_column = """
11118                        transcript
11119                    """
11120
11121                # Query sections
11122                query_transcript_column_select = (
11123                    f"{query_transcript_column} AS transcript"
11124                )
11125                query_transcript_column_group_by = query_transcript_column
11126
11127                # Query for transcripts view
11128                query_merge_on_transcripts = f"""
11129                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11130                    FROM ({query_merge}) AS {transcript_table_tmp}
11131                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11132                """
11133
11134            # Drop transcript view is necessary
11135            if transcripts_table_drop:
11136                query_drop = f"""
11137                    DROP TABLE IF EXISTS {transcripts_table};
11138                """
11139                self.execute_query(query=query_drop)
11140
11141            # # Merge and create transcript view
11142            # query_create_view = f"""
11143            #     CREATE TABLE IF NOT EXISTS {transcripts_table}
11144            #     AS {query_merge_on_transcripts}
11145            # """
11146            # self.execute_query(query=query_create_view)
11147
11148            # Using #CHROM chunk
11149            ######
11150
11151            # List of unique #CHROM
11152            query_unique_chrom = f"""
11153                SELECT DISTINCT "#CHROM"
11154                FROM variants AS subquery
11155            """
11156            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11157
11158            # Create table with structure but without data, if not exists
11159            query_create_table = f"""
11160                CREATE TABLE IF NOT EXISTS {transcripts_table} AS
11161                SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0
11162            """
11163            self.execute_query(query=query_create_table)
11164
11165            # Process by #CHROM
11166            for chrom in unique_chroms["#CHROM"]:
11167
11168                # Log
11169                log.debug(f"Processing #CHROM={chrom}")
11170
11171                # Select data by #CHROM
11172                query_chunk = f"""
11173                    SELECT *
11174                    FROM ({query_merge_on_transcripts})
11175                    WHERE "#CHROM" = '{chrom}'
11176                """
11177
11178                # Insert data
11179                query_insert_chunk = f"""
11180                    INSERT INTO {transcripts_table}
11181                    {query_chunk}
11182                """
11183                self.execute_query(query=query_insert_chunk)
11184
11185            # Remove temporary tables
11186            if temporary_tables:
11187                for temporary_table in list(set(temporary_tables)):
11188                    query_drop_tmp_table = f"""
11189                        DROP TABLE IF EXISTS {temporary_table}
11190                    """
11191                    self.execute_query(query=query_drop_tmp_table)
11192
11193            # Remove added columns
11194            for added_column in added_columns:
11195                self.drop_column(column=added_column)
11196
11197        else:
11198
11199            transcripts_table = None
11200
11201        return transcripts_table
11202
11203    def annotation_format_to_table(
11204        self,
11205        uniquify: bool = True,
11206        annotation_field: str = "ANN",
11207        annotation_id: str = "Feature_ID",
11208        view_name: str = "transcripts",
11209        column_rename: dict = {},
11210        column_clean: bool = False,
11211        column_case: str = None,
11212    ) -> str:
11213        """
11214        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11215        structured table format, ensuring unique values and creating a temporary table for further
11216        processing or analysis.
11217
11218        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11219        unique values in the output or not. If set to `True`, the function will make sure that the
11220        output values are unique, defaults to True
11221        :type uniquify: bool (optional)
11222        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11223        that contains the annotation information for each variant. This field is used to extract the
11224        annotation details for further processing in the function. By default, it is set to "ANN",
11225        defaults to ANN
11226        :type annotation_field: str (optional)
11227        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11228        is used to specify the identifier for the annotation feature. This identifier will be used as a
11229        column name in the resulting table or view that is created based on the annotation data. It
11230        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11231        :type annotation_id: str (optional)
11232        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11233        to specify the name of the temporary table that will be created to store the transformed
11234        annotation data. This table will hold the extracted information from the annotation field in a
11235        structured format for further processing or analysis. By default,, defaults to transcripts
11236        :type view_name: str (optional)
11237        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11238        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11239        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11240        created based on the annotation data. This feature enables
11241        :type column_rename: dict
11242        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11243        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11244        If set to `True`, the function will clean the annotation field before further processing. This
11245        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11246        to False
11247        :type column_clean: bool (optional)
11248        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11249        used to specify the case transformation to be applied to the column names extracted from the
11250        annotation data. It allows you to set the case of the column names to either lowercase or
11251        uppercase for consistency or other specific requirements during the conversion
11252        :type column_case: str
11253        :return: The function `annotation_format_to_table` is returning the name of the view created,
11254        which is stored in the variable `view_name`.
11255        """
11256
11257        # Annotation field
11258        annotation_format = "annotation_explode"
11259
11260        # Transcript annotation
11261        if column_rename:
11262            annotation_id = column_rename.get(annotation_id, annotation_id)
11263
11264        if column_clean:
11265            annotation_id = clean_annotation_field(annotation_id)
11266
11267        # Prefix
11268        prefix = self.get_explode_infos_prefix()
11269        if prefix:
11270            prefix = "INFO/"
11271
11272        # Annotation fields
11273        annotation_infos = prefix + annotation_field
11274        annotation_format_infos = prefix + annotation_format
11275
11276        # Variants table
11277        table_variants = self.get_table_variants()
11278
11279        # Header
11280        vcf_reader = self.get_header()
11281
11282        # Add columns
11283        added_columns = []
11284
11285        # Explode HGVS field in column
11286        added_columns += self.explode_infos(fields=[annotation_field])
11287
11288        if annotation_field in vcf_reader.infos:
11289
11290            # Extract ANN header
11291            ann_description = vcf_reader.infos[annotation_field].desc
11292            pattern = r"'(.+?)'"
11293            match = re.search(pattern, ann_description)
11294            if match:
11295                ann_header_match = match.group(1).split(" | ")
11296                ann_header = []
11297                ann_header_desc = {}
11298                for i in range(len(ann_header_match)):
11299                    ann_header_info = "".join(
11300                        char for char in ann_header_match[i] if char.isalnum()
11301                    )
11302                    ann_header.append(ann_header_info)
11303                    ann_header_desc[ann_header_info] = ann_header_match[i]
11304                if not ann_header_desc:
11305                    raise ValueError("Invalid header description format")
11306            else:
11307                raise ValueError("Invalid header description format")
11308
11309            # Create variant id
11310            variant_id_column = self.get_variant_id_column()
11311            added_columns += [variant_id_column]
11312
11313            # Get list of #CHROM
11314            query_unique_chrom = f"""
11315                SELECT DISTINCT "#CHROM"
11316                FROM variants AS subquery
11317            """
11318            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11319
11320            # Base for database anontation format
11321            dataframe_annotation_format_base = f"""
11322                SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}"
11323                FROM {table_variants}
11324            """
11325
11326            # Create dataframe for keys column type
11327            dataframe_annotation_format = self.get_query_to_df(
11328                f""" {dataframe_annotation_format_base} LIMIT 1000 """
11329            )
11330
11331            # Define a vectorized function to apply explode_annotation_format
11332            vectorized_explode_annotation_format = np.vectorize(
11333                lambda x: explode_annotation_format(
11334                    annotation=str(x),
11335                    uniquify=uniquify,
11336                    output_format="JSON",
11337                    prefix="",
11338                    header=list(ann_header_desc.values()),
11339                )
11340            )
11341
11342            # Assign the exploded annotations back to the dataframe
11343            dataframe_annotation_format[annotation_format_infos] = (
11344                vectorized_explode_annotation_format(
11345                    dataframe_annotation_format[annotation_infos].to_numpy()
11346                )
11347            )
11348
11349            # Find keys
11350            query_json = f"""
11351                SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key'
11352                FROM dataframe_annotation_format;
11353            """
11354            df_keys = self.get_query_to_df(query=query_json)
11355
11356            # Check keys
11357            query_json_key = []
11358            for _, row in df_keys.iterrows():
11359
11360                # Key
11361                key = row.iloc[0]
11362                key_clean = key
11363
11364                # key rename
11365                if column_rename:
11366                    key_clean = column_rename.get(key_clean, key_clean)
11367
11368                # key clean
11369                if column_clean:
11370                    key_clean = clean_annotation_field(key_clean)
11371
11372                # Key case
11373                if column_case:
11374                    if column_case.lower() in ["lower"]:
11375                        key_clean = key_clean.lower()
11376                    elif column_case.lower() in ["upper"]:
11377                        key_clean = key_clean.upper()
11378
11379                # Type
11380                query_json_type = f"""
11381                    SELECT * 
11382                    FROM (
11383                        SELECT 
11384                            NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '') AS '{key_clean}'
11385                        FROM
11386                            dataframe_annotation_format
11387                        )
11388                    WHERE "{key_clean}" NOT NULL AND "{key_clean}" NOT IN ('')
11389                """
11390
11391                # Get DataFrame from query
11392                df_json_type = self.get_query_to_df(query=query_json_type)
11393
11394                # Detect column type
11395                column_type = detect_column_type(df_json_type[key_clean])
11396
11397                # Free up memory
11398                del df_json_type
11399
11400                # Append
11401                query_json_key.append(
11402                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11403                )
11404
11405            # Create table with structure but without data, if not exists
11406            query_create_table = f"""
11407                CREATE TABLE IF NOT EXISTS {view_name}
11408                AS (
11409                    SELECT *, {annotation_id} AS 'transcript'
11410                    FROM (
11411                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11412                        FROM dataframe_annotation_format
11413                        )
11414                    LIMIT 0
11415                    );
11416            """
11417            self.execute_query(query=query_create_table)
11418
11419            # Free up memory
11420            del dataframe_annotation_format
11421
11422            # Insert data by chromosome
11423            for chrom in unique_chroms["#CHROM"]:
11424
11425                # Log
11426                log.debug(f"Processing #CHROM={chrom}")
11427
11428                # Create dataframe
11429                dataframe_annotation_format = self.get_query_to_df(
11430                    f""" {dataframe_annotation_format_base}  WHERE "#CHROM" = '{chrom}' """
11431                )
11432
11433                # Define a vectorized function to apply explode_annotation_format
11434                vectorized_explode_annotation_format = np.vectorize(
11435                    lambda x: explode_annotation_format(
11436                        annotation=str(x),
11437                        uniquify=uniquify,
11438                        output_format="JSON",
11439                        prefix="",
11440                        header=list(ann_header_desc.values()),
11441                    )
11442                )
11443
11444                # Assign the exploded annotations back to the dataframe
11445                dataframe_annotation_format[annotation_format_infos] = (
11446                    vectorized_explode_annotation_format(
11447                        dataframe_annotation_format[annotation_infos].to_numpy()
11448                    )
11449                )
11450
11451                # Insert data into tmp table
11452                query_insert_chunk = f"""
11453                    INSERT INTO {view_name}
11454                    SELECT *, {annotation_id} AS 'transcript'
11455                    FROM (
11456                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11457                        FROM dataframe_annotation_format
11458                        )
11459                """
11460                self.execute_query(query=query_insert_chunk)
11461
11462                # Free up memory
11463                del dataframe_annotation_format
11464
11465        else:
11466
11467            # Return None
11468            view_name = None
11469
11470        # Remove added columns
11471        for added_column in added_columns:
11472            self.drop_column(column=added_column)
11473
11474        return view_name
11475
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Merge per-transcript annotations from a transcripts table back into the
        variants table, as a JSON column and/or INFO field, and/or as a
        pipe-separated structured column and/or INFO field.

        Every parameter falls back to `param["transcripts"][...]` (or
        `self.get_param()` when `param` is empty) when not explicitly given.
        At least one of the four `transcripts_info_*` targets must be set,
        otherwise the function returns False without doing anything.

        :param transcripts_table: Name of the table containing the transcripts
        data, defaults to "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: Column of the transcripts table holding
        the unique transcript identifier used to group annotations, defaults
        to "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: Name of the variants-table column that
        will receive the transcripts information as JSON
        :type transcripts_info_json: str
        :param transcripts_info_field_json: Name of the VCF INFO field that
        will receive the transcripts information as JSON (also added to the
        VCF header)
        :type transcripts_info_field_json: str
        :param transcripts_info_format: Name of the variants-table column that
        will receive the transcripts information in structured
        (pipe-separated) format
        :type transcripts_info_format: str
        :param transcripts_info_field_format: Name of the VCF INFO field that
        will receive the transcripts information in structured
        (pipe-separated) format (also added to the VCF header)
        :type transcripts_info_field_format: str
        :param param: Configuration dictionary overriding `self.get_param()`
        :type param: dict
        :return: True when at least one annotation target was configured and
        the updates were executed, False otherwise
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default values used when neither the argument nor param provides one
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param (fall back to the object-level parameters when not provided)
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no output column/field is configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts annotation columns: everything except the variant key
        # columns and the transcript identifier itself
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
                )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build the SELECT/JSON/FORMAT clause fragments for each annotation
        # column; comma-separated values are split into one row per value
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # SET clauses accumulated for the JSON and FORMAT UPDATE queries
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): source/version are the literal string "unknwon"
            # (sic) — likely a typo for "unknown"; kept as-is to preserve the
            # emitted header text
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append '<field>=<json>' to INFO, skipping empty/missing values
            update_set_json.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field_json}=',
                                    t.{transcripts_info_json}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Add header
            # NOTE(review): "unknwon" kept as-is (see note above on the JSON column)
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: aggregate one JSON object per variant, keyed by
            # transcript ID, then join back on (#CHROM, POS, REF, ALT)
            query_update = f"""
                UPDATE {table_variants}
                    SET {", ".join(update_set_json)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                            concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                            )::JSON AS {transcripts_info_json}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            to_json(
                                {{{",".join(clause_to_json)}}}
                            )::JSON AS json_output
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        WHERE "{transcripts_column_id}" IS NOT NULL
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" kept as-is (see note above on the JSON column)
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # Set variable for internal queries (alias name used inside the
            # FORMAT aggregation query below, no column is created)
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append '<field>=<pipe-separated values>' to INFO, skipping
            # empty/missing values
            update_set_format.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field_format}=',
                                    t.{transcripts_info_format}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Add header
            # NOTE(review): "unknwon" kept as-is (see note above on the JSON column)
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: one pipe-separated record per transcript,
            # aggregated per variant, then joined back on the variant key
            query_update = f"""
                UPDATE {table_variants}
                    SET {", ".join(update_set_format)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                    FROM 
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            concat(
                                "{transcripts_column_id}",
                                '|',
                                {", '|', ".join(clause_to_format)}
                            ) AS {transcripts_info_format}
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
11839
11840    def rename_info_fields(
11841        self, fields_to_rename: dict = None, table: str = None
11842    ) -> dict:
11843        """
11844        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11845        corresponding INFO fields in the variants table.
11846
11847        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11848        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11849        represent the original field names that need to be renamed, and the corresponding values
11850        represent the new names to which the fields should be
11851        :type fields_to_rename: dict
11852        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11853        the table in which the variants data is stored. This table contains information about genetic
11854        variants, and the function updates the corresponding INFO fields in this table when renaming
11855        specified fields in the VCF file header
11856        :type table: str
11857        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11858        the original field names as keys and their corresponding new names (or None if the field was
11859        removed) as values after renaming or removing specified fields in a VCF file header and updating
11860        corresponding INFO fields in the variants table.
11861        """
11862
11863        # Init
11864        fields_renamed = {}
11865        config = self.get_config()
11866        access = config.get("access")
11867
11868        if table is None:
11869            table = self.get_table_variants()
11870
11871        # regexp replace fonction
11872        regex_replace_dict = {}
11873        regex_replace_nb = 0
11874        regex_replace_partition = 125
11875        regex_replace = "concat(INFO, ';')"  # Add ';' to reduce regexp comlexity
11876
11877        if fields_to_rename is not None and access not in ["RO"]:
11878
11879            log.info("Rename or remove fields...")
11880
11881            # Header
11882            header = self.get_header()
11883
11884            for field_to_rename, field_renamed in fields_to_rename.items():
11885
11886                if field_to_rename in header.infos:
11887
11888                    # Rename header
11889                    if field_renamed is not None:
11890                        header.infos[field_renamed] = vcf.parser._Info(
11891                            field_renamed,
11892                            header.infos[field_to_rename].num,
11893                            header.infos[field_to_rename].type,
11894                            header.infos[field_to_rename].desc,
11895                            header.infos[field_to_rename].source,
11896                            header.infos[field_to_rename].version,
11897                            header.infos[field_to_rename].type_code,
11898                        )
11899                    del header.infos[field_to_rename]
11900
11901                    # Rename INFO patterns
11902                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
11903                    if field_renamed is not None:
11904                        field_renamed_pattern = rf"\1{field_renamed}\3;"
11905                    else:
11906                        field_renamed_pattern = r"\1"
11907
11908                    # regexp replace
11909                    regex_replace_nb += 1
11910                    regex_replace_key = math.floor(
11911                        regex_replace_nb / regex_replace_partition
11912                    )
11913                    if (regex_replace_nb % regex_replace_partition) == 0:
11914                        regex_replace = "concat(INFO, ';')"
11915                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
11916                    regex_replace_dict[regex_replace_key] = regex_replace
11917
11918                    # Return
11919                    fields_renamed[field_to_rename] = field_renamed
11920
11921                    # Log
11922                    if field_renamed is not None:
11923                        log.info(
11924                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
11925                        )
11926                    else:
11927                        log.info(
11928                            f"Rename or remove fields - field '{field_to_rename}' removed"
11929                        )
11930
11931                else:
11932
11933                    log.warning(
11934                        f"Rename or remove fields - field '{field_to_rename}' not in header"
11935                    )
11936
11937            # Rename INFO
11938            for regex_replace_key, regex_replace in regex_replace_dict.items():
11939                log.info(
11940                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
11941                )
11942                query = f"""
11943                    UPDATE {table}
11944                    SET
11945                        INFO = regexp_replace({regex_replace}, ';$', '')
11946                """
11947                log.debug(f"query={query}")
11948                self.execute_query(query=query)
11949
11950        return fields_renamed
11951
11952    def calculation_rename_info_fields(
11953        self,
11954        fields_to_rename: dict = None,
11955        table: str = None,
11956        operation_name: str = "RENAME_INFO_FIELDS",
11957    ) -> None:
11958        """
11959        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11960        fields to rename and table if provided, and then calls another function to rename the fields.
11961
11962        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11963        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11964        the key and the new field name as the value
11965        :type fields_to_rename: dict
11966        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11967        specify the name of the table for which the fields are to be renamed. It is a string type
11968        parameter
11969        :type table: str
11970        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11971        method is a string that specifies the name of the operation being performed. In this context, it
11972        is used as a default value for the operation name if not explicitly provided when calling the
11973        function, defaults to RENAME_INFO_FIELDS
11974        :type operation_name: str (optional)
11975        """
11976
11977        # Param
11978        param = self.get_param()
11979
11980        # Get param fields to rename
11981        param_fields_to_rename = (
11982            param.get("calculation", {})
11983            .get("calculations", {})
11984            .get(operation_name, {})
11985            .get("fields_to_rename", None)
11986        )
11987
11988        # Get param table
11989        param_table = (
11990            param.get("calculation", {})
11991            .get("calculations", {})
11992            .get(operation_name, {})
11993            .get("table", None)
11994        )
11995
11996        # Init fields_to_rename
11997        if fields_to_rename is None:
11998            fields_to_rename = param_fields_to_rename
11999
12000        # Init table
12001        if table is None:
12002            table = param_table
12003
12004        renamed_fields = self.rename_info_fields(
12005            fields_to_rename=fields_to_rename, table=table
12006        )
12007
12008        log.debug(f"renamed_fields:{renamed_fields}")
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
39    def __init__(
40        self,
41        conn=None,
42        input: str = None,
43        output: str = None,
44        config: dict = {},
45        param: dict = {},
46        load: bool = False,
47    ) -> None:
48        """
49        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
50        header
51
52        :param conn: the connection to the database
53        :param input: the input file
54        :param output: the output file
55        :param config: a dictionary containing the configuration of the model
56        :param param: a dictionary containing the parameters of the model
57        """
58
59        # Init variables
60        self.init_variables()
61
62        # Input
63        self.set_input(input)
64
65        # Config
66        self.set_config(config)
67
68        # Param
69        self.set_param(param)
70
71        # Output
72        self.set_output(output)
73
74        # connexion
75        self.set_connexion(conn)
76
77        # Header
78        self.set_header()
79
80        # Samples
81        self.set_samples()
82
83        # Load data
84        if load:
85            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 87    def set_samples(self, samples: list = None) -> list:
 88        """
 89        The function `set_samples` sets the samples attribute of an object to a provided list or
 90        retrieves it from a parameter dictionary.
 91
 92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 93        input and sets the `samples` attribute of the class to the provided list. If no samples are
 94        provided, it tries to get the samples from the class's parameters using the `get_param` method
 95        :type samples: list
 96        :return: The `samples` list is being returned.
 97        """
 98
 99        if not samples:
100            samples = self.get_param().get("samples", {}).get("list", None)
101
102        self.samples = samples
103
104        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
106    def get_samples(self) -> list:
107        """
108        This function returns a list of samples.
109        :return: The `get_samples` method is returning the `samples` attribute of the object.
110        """
111
112        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
114    def get_samples_check(self) -> bool:
115        """
116        This function returns the value of the "check" key within the "samples" dictionary retrieved
117        from the parameters.
118        :return: The method `get_samples_check` is returning the value of the key "check" inside the
119        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
120        method. If the key "check" is not found, it will return `True`.
121        """
122
123        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it will return True.

def set_input(self, input: str = None) -> None:
125    def set_input(self, input: str = None) -> None:
126        """
127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
128        attributes in the class accordingly.
129
130        :param input: The `set_input` method in the provided code snippet is used to set attributes
131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
132        :type input: str
133        """
134
135        if input and not isinstance(input, str):
136            try:
137                self.input = input.name
138            except:
139                log.error(f"Input file '{input} in bad format")
140                raise ValueError(f"Input file '{input} in bad format")
141        else:
142            self.input = input
143
144        # Input format
145        if input:
146            input_name, input_extension = os.path.splitext(self.input)
147            self.input_name = input_name
148            self.input_extension = input_extension
149            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: The input file path (or a file-like object exposing a name attribute); it is used to set the input attribute and to derive the input name, extension, and format.
def set_config(self, config: dict) -> None:
151    def set_config(self, config: dict) -> None:
152        """
153        The set_config function takes a config object and assigns it as the configuration object for the
154        class.
155
156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
157        contains configuration settings for the class. When you call the `set_config` function with a
158        dictionary object as the argument, it will set that dictionary as the configuration object for
159        the class
160        :type config: dict
161        """
162
163        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
165    def set_param(self, param: dict) -> None:
166        """
167        This function sets a parameter object for the class based on the input dictionary.
168
169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
170        as the `param` attribute of the class instance
171        :type param: dict
172        """
173
174        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
176    def init_variables(self) -> None:
177        """
178        This function initializes the variables that will be used in the rest of the class
179        """
180
181        self.prefix = "howard"
182        self.table_variants = "variants"
183        self.dataframe = None
184
185        self.comparison_map = {
186            "gt": ">",
187            "gte": ">=",
188            "lt": "<",
189            "lte": "<=",
190            "equals": "=",
191            "contains": "SIMILAR TO",
192        }
193
194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
195
196        self.code_type_map_to_sql = {
197            "Integer": "INTEGER",
198            "String": "VARCHAR",
199            "Float": "FLOAT",
200            "Flag": "VARCHAR",
201        }
202
203        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
205    def get_indexing(self) -> bool:
206        """
207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
208        returns False.
209        :return: The value of the indexing parameter.
210        """
211
212        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
214    def get_connexion_config(self) -> dict:
215        """
216        The function `get_connexion_config` returns a dictionary containing the configuration for a
217        connection, including the number of threads and memory limit.
218        :return: a dictionary containing the configuration for the Connexion library.
219        """
220
221        # config
222        config = self.get_config()
223
224        # Connexion config
225        connexion_config = {}
226        threads = self.get_threads()
227
228        # Threads
229        if threads:
230            connexion_config["threads"] = threads
231
232        # Memory
233        # if config.get("memory", None):
234        #     connexion_config["memory_limit"] = config.get("memory")
235        if self.get_memory():
236            connexion_config["memory_limit"] = self.get_memory()
237
238        # Temporary directory
239        if config.get("tmp", None):
240            connexion_config["temp_directory"] = config.get("tmp")
241
242        # Access
243        if config.get("access", None):
244            access = config.get("access")
245            if access in ["RO"]:
246                access = "READ_ONLY"
247            elif access in ["RW"]:
248                access = "READ_WRITE"
249            connexion_db = self.get_connexion_db()
250            if connexion_db in ":memory:":
251                access = "READ_WRITE"
252            connexion_config["access_mode"] = access
253
254        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the database connection configuration (threads, memory limit, temporary directory, access mode).

def get_duckdb_settings(self) -> dict:
256    def get_duckdb_settings(self) -> dict:
257        """
258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
259        string.
260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
261        """
262
263        # config
264        config = self.get_config()
265
266        # duckdb settings
267        duckdb_settings_dict = {}
268        if config.get("duckdb_settings", None):
269            duckdb_settings = config.get("duckdb_settings")
270            duckdb_settings = full_path(duckdb_settings)
271            # duckdb setting is a file
272            if os.path.exists(duckdb_settings):
273                with open(duckdb_settings) as json_file:
274                    duckdb_settings_dict = yaml.safe_load(json_file)
275            # duckdb settings is a string
276            else:
277                duckdb_settings_dict = json.loads(duckdb_settings)
278
279        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
281    def set_connexion_db(self) -> str:
282        """
283        The function `set_connexion_db` returns the appropriate database connection string based on the
284        input format and connection type.
285        :return: the value of the variable `connexion_db`.
286        """
287
288        # Default connexion db
289        default_connexion_db = ":memory:"
290
291        # Find connexion db
292        if self.get_input_format() in ["db", "duckdb"]:
293            connexion_db = self.get_input()
294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
295            connexion_db = default_connexion_db
296        elif self.get_connexion_type() in ["tmpfile"]:
297            tmp_name = tempfile.mkdtemp(
298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
299            )
300            connexion_db = f"{tmp_name}/tmp.db"
301        elif self.get_connexion_type() != "":
302            connexion_db = self.get_connexion_type()
303        else:
304            connexion_db = default_connexion_db
305
306        # Set connexion db
307        self.connexion_db = connexion_db
308
309        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
311    def set_connexion(self, conn) -> None:
312        """
313        The function `set_connexion` creates a connection to a database, with options for different
314        database formats and settings.
315
316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
317        database. If a connection is not provided, a new connection to an in-memory database is created.
318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
319        sqlite
320        """
321
322        # Connexion db
323        connexion_db = self.set_connexion_db()
324
325        # Connexion config
326        connexion_config = self.get_connexion_config()
327
328        # Connexion format
329        connexion_format = self.get_config().get("connexion_format", "duckdb")
330        # Set connexion format
331        self.connexion_format = connexion_format
332
333        # Connexion
334        if not conn:
335            if connexion_format in ["duckdb"]:
336                conn = duckdb.connect(connexion_db, config=connexion_config)
337                # duckDB settings
338                duckdb_settings = self.get_duckdb_settings()
339                if duckdb_settings:
340                    for setting in duckdb_settings:
341                        setting_value = duckdb_settings.get(setting)
342                        if isinstance(setting_value, str):
343                            setting_value = f"'{setting_value}'"
344                        conn.execute(f"PRAGMA {setting}={setting_value};")
345            elif connexion_format in ["sqlite"]:
346                conn = sqlite3.connect(connexion_db)
347
348        # Set connexion
349        self.conn = conn
350
351        # Log
352        log.debug(f"connexion_format: {connexion_format}")
353        log.debug(f"connexion_db: {connexion_db}")
354        log.debug(f"connexion config: {connexion_config}")
355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
357    def set_output(self, output: str = None) -> None:
358        """
359        The `set_output` function in Python sets the output file based on the input or a specified key
360        in the config file, extracting the output name, extension, and format.
361
362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
363        the output file. If the config file has an 'output' key, the method sets the output to the value
364        of that key. If no output is provided, it sets the output to `None`
365        :type output: str
366        """
367
368        if output and not isinstance(output, str):
369            self.output = output.name
370        else:
371            self.output = output
372
373        # Output format
374        if self.output:
375            output_name, output_extension = os.path.splitext(self.output)
376            self.output_name = output_name
377            self.output_extension = output_extension
378            self.output_format = self.output_extension.replace(".", "")
379        else:
380            self.output_name = None
381            self.output_extension = None
382            self.output_format = None

The set_output function sets the output file and, when an output is provided, extracts the output name, extension, and format from its path.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
384    def set_header(self) -> None:
385        """
386        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
387        """
388
389        input_file = self.get_input()
390        default_header_list = [
391            "##fileformat=VCFv4.2",
392            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
393        ]
394
395        # Full path
396        input_file = full_path(input_file)
397
398        if input_file:
399
400            input_format = self.get_input_format()
401            input_compressed = self.get_input_compressed()
402            config = self.get_config()
403            header_list = default_header_list
404            if input_format in [
405                "vcf",
406                "hdr",
407                "tsv",
408                "csv",
409                "psv",
410                "parquet",
411                "db",
412                "duckdb",
413            ]:
414                # header provided in param
415                if config.get("header_file", None):
416                    with open(config.get("header_file"), "rt") as f:
417                        header_list = self.read_vcf_header(f)
418                # within a vcf file format (header within input file itsself)
419                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
420                    # within a compressed vcf file format (.vcf.gz)
421                    if input_compressed:
422                        with bgzf.open(input_file, "rt") as f:
423                            header_list = self.read_vcf_header(f)
424                    # within an uncompressed vcf file format (.vcf)
425                    else:
426                        with open(input_file, "rt") as f:
427                            header_list = self.read_vcf_header(f)
428                # header provided in default external file .hdr
429                elif os.path.exists((input_file + ".hdr")):
430                    with open(input_file + ".hdr", "rt") as f:
431                        header_list = self.read_vcf_header(f)
432                else:
433                    try:  # Try to get header info fields and file columns
434
435                        with tempfile.TemporaryDirectory() as tmpdir:
436
437                            # Create database
438                            db_for_header = Database(database=input_file)
439
440                            # Get header columns for infos fields
441                            db_header_from_columns = (
442                                db_for_header.get_header_from_columns()
443                            )
444
445                            # Get real columns in the file
446                            db_header_columns = db_for_header.get_columns()
447
448                            # Write header file
449                            header_file_tmp = os.path.join(tmpdir, "header")
450                            f = open(header_file_tmp, "w")
451                            vcf.Writer(f, db_header_from_columns)
452                            f.close()
453
454                            # Replace #CHROM line with rel columns
455                            header_list = db_for_header.read_header_file(
456                                header_file=header_file_tmp
457                            )
458                            header_list[-1] = "\t".join(db_header_columns)
459
460                    except:
461
462                        log.warning(
463                            f"No header for file {input_file}. Set as default VCF header"
464                        )
465                        header_list = default_header_list
466
467            else:  # try for unknown format ?
468
469                log.error(f"Input file format '{input_format}' not available")
470                raise ValueError(f"Input file format '{input_format}' not available")
471
472            if not header_list:
473                header_list = default_header_list
474
475            # header as list
476            self.header_list = header_list
477
478            # header as VCF object
479            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
480
481        else:
482
483            self.header_list = None
484            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
487        """
488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
489        DataFrame based on the connection format.
490
491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
492        represents the SQL query you want to execute. This query will be used to fetch data from a
493        database and convert it into a pandas DataFrame
494        :type query: str
495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
497        function will only fetch up to that number of rows from the database query result. If no limit
498        is specified,
499        :type limit: int
500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
501        """
502
503        # Connexion format
504        connexion_format = self.get_connexion_format()
505
506        # Limit in query
507        if limit:
508            pd.set_option("display.max_rows", limit)
509            if connexion_format in ["duckdb"]:
510                df = (
511                    self.conn.execute(query)
512                    .fetch_record_batch(limit)
513                    .read_next_batch()
514                    .to_pandas()
515                )
516            elif connexion_format in ["sqlite"]:
517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
518
519        # Full query
520        else:
521            if connexion_format in ["duckdb"]:
522                df = self.conn.execute(query).df()
523            elif connexion_format in ["sqlite"]:
524                df = pd.read_sql_query(query, self.conn)
525
526        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is fetched.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
528    def get_overview(self) -> None:
529        """
530        The function prints the input, output, config, and dataframe of the current object
531        """
532        table_variants_from = self.get_table_variants(clause="from")
533        sql_columns = self.get_header_columns_as_sql()
534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
535        df = self.get_query_to_df(sql_query_export)
536        log.info(
537            "Input:  "
538            + str(self.get_input())
539            + " ["
540            + str(str(self.get_input_format()))
541            + "]"
542        )
543        log.info(
544            "Output: "
545            + str(self.get_output())
546            + " ["
547            + str(str(self.get_output_format()))
548            + "]"
549        )
550        log.info("Config: ")
551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
552            "\n"
553        ):
554            log.info("\t" + str(d))
555        log.info("Param: ")
556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
557            "\n"
558        ):
559            log.info("\t" + str(d))
560        log.info("Sample list: " + str(self.get_header_sample_list()))
561        log.info("Dataframe: ")
562        for d in str(df).split("\n"):
563            log.info("\t" + str(d))
564
565        # garbage collector
566        del df
567        gc.collect()
568
569        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
def get_stats(self) -> dict:
    """
    Calculate and return statistics of the current object, including
    information about the input file, variants (counts by chromosome,
    SNV/MNV/InDel counts, substitutions), samples and genotypes, header
    INFO/FORMAT fields, and quality metrics.

    :return: a dictionary of statistics, with sections "Infos", "Variants",
        "Header", and optionally "Samples" (when genotypes are available)
        and "Quality" (when a QUAL column is available)
    """

    # Log
    log.info("Stats Calculation...")

    # Table variants
    table_variants_from = self.get_table_variants()

    # Stats dict
    stats = {"Infos": {}}

    ### File
    input_file = self.get_input()
    stats["Infos"]["Input file"] = input_file

    # Header
    header_infos = self.get_header().infos
    header_formats = self.get_header().formats
    header_infos_list = list(header_infos)
    header_formats_list = list(header_formats)

    ### Variants

    stats["Variants"] = {}

    # Variants by chromosome
    sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
    df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
    nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
        by=["CHROM"], kind="quicksort"
    )

    # Total number of variants
    nb_of_variants = nb_of_variants_by_chrom["count"].sum()

    # Percentage of variants per chromosome
    nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
        lambda x: (x / nb_of_variants)
    )

    stats["Variants"]["Number of variants by chromosome"] = (
        nb_of_variants_by_chrom.to_dict(orient="index")
    )

    stats["Infos"]["Number of variants"] = int(nb_of_variants)

    ### Samples

    # Init
    samples = {}
    nb_of_samples = 0

    # Check samples: only meaningful when genotypes (GT) are available
    if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
        log.debug("Check samples...")
        for sample in self.get_header_sample_list():
            # Count genotypes, keeping only well-formed sample columns
            # (same number of ':'-separated fields as the FORMAT column)
            sql_query_samples = f"""
                SELECT  '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                FROM {table_variants_from}
                WHERE (
                    regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                    AND
                    len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                  )
                GROUP BY genotype
                """
            sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
            sample_genotype_count = sql_query_genotype_df["count"].sum()
            if len(sql_query_genotype_df):
                nb_of_samples += 1
                samples[f"{sample} - {sample_genotype_count} variants"] = (
                    sql_query_genotype_df.to_dict(orient="index")
                )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

    ### INFO and FORMAT fields
    header_types_df = {}
    header_types_list = {
        "List of INFO fields": header_infos,
        "List of FORMAT fields": header_formats,
    }
    i = 0
    for header_type in header_types_list:

        header_type_infos = header_types_list.get(header_type)
        header_infos_dict = {}

        for info in header_type_infos:

            i += 1
            header_infos_dict[i] = {}

            # ID
            header_infos_dict[i]["id"] = info

            # Number: map special VCF cardinalities to their symbolic form
            # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
            genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
            if header_type_infos[info].num in genotype_map:
                header_infos_dict[i]["Number"] = genotype_map.get(
                    header_type_infos[info].num
                )
            else:
                header_infos_dict[i]["Number"] = header_type_infos[info].num

            # Type
            if header_type_infos[info].type:
                header_infos_dict[i]["Type"] = header_type_infos[info].type
            else:
                header_infos_dict[i]["Type"] = "."

            # Description
            if header_type_infos[info].desc is not None:
                header_infos_dict[i]["Description"] = header_type_infos[info].desc
            else:
                header_infos_dict[i]["Description"] = ""

        if len(header_infos_dict):
            header_types_df[header_type] = pd.DataFrame.from_dict(
                header_infos_dict, orient="index"
            ).to_dict(orient="index")

    # Stats
    stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
    stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
    stats["Header"] = header_types_df

    ### QUAL
    if "QUAL" in self.get_header_columns():
        sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

        qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
        stats["Quality"] = {"Stats": qual}

    ### SNV, MNV and InDel counts

    sql_query_snv = f"""

        SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE (len(REF) > 1 OR len(ALT) > 1)
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

        ORDER BY count DESC

            """
    # NOTE: the InDel clause above is parenthesized on purpose: without the
    # parentheses, SQL's AND-over-OR precedence made every variant with
    # len(REF) > 1 count as an InDel, including equal-length MNVs.
    snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

    sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
    snv_substitution = (
        self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
    )
    stats["Variants"]["Counts"] = snv_indel
    stats["Variants"]["Substitutions"] = snv_substitution

    return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
def stats_to_file(self, file: str = None) -> str:
    """
    Compute the stats of the current object and dump them as an indented
    JSON object into the given file.

    :param file: path of the JSON file to write
    :type file: str
    :return: the path of the written file
    """

    # Serialize the computed stats and write them out in one step
    serialized = json.dumps(self.get_stats(), indent=4)
    with open(file, "w") as outfile:
        outfile.write(serialized)

    return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    """
    Generate a Markdown stats file from the stats JSON and print the
    statistics to stdout in a formatted manner.

    :param output_file: path of the Markdown output file; a temporary
        "stats.md" is used if not provided
    :type output_file: str
    :param json_file: path of the JSON stats file to write; a temporary
        "stats.json" is used if not provided
    :type json_file: str
    :return: None
    """

    # Full path
    output_file = full_path(output_file)
    json_file = full_path(json_file)

    with tempfile.TemporaryDirectory() as tmpdir:

        # Default files in the temporary directory
        if not output_file:
            output_file = os.path.join(tmpdir, "stats.md")
        if not json_file:
            json_file = os.path.join(tmpdir, "stats.json")

        # Create output folders if needed
        if not os.path.exists(os.path.dirname(output_file)):
            Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
        if not os.path.exists(os.path.dirname(json_file)):
            Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

        # Create stats JSON file
        stats_file = self.stats_to_file(file=json_file)

        # Load the stats; the file is JSON written by stats_to_file, so
        # parse it with the json module (previously yaml.safe_load, which
        # accepts JSON but pulls in an unneeded dependency)
        with open(stats_file) as f:
            stats = json.load(f)

        # Output buffers: title, index and content sections
        output_title = []
        output_index = []
        output = []

        # Title
        output_title.append("# HOWARD Stats")

        # Index
        output_index.append("## Index")

        # Process sections
        for section in stats:
            infos = stats.get(section)
            section_link = "#" + section.lower().replace(" ", "-")
            output.append(f"## {section}")
            output_index.append(f"- [{section}]({section_link})")

            if len(infos):
                for info in infos:
                    # Try to render the value as a Markdown table; fall back
                    # to a plain "key: value" line when it is not tabular.
                    # Narrowed from bare `except:` to avoid swallowing
                    # KeyboardInterrupt/SystemExit.
                    try:
                        df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                        is_df = True
                    except Exception:
                        try:
                            df = pd.DataFrame.from_dict(
                                json.loads((infos.get(info))), orient="index"
                            )
                            is_df = True
                        except Exception:
                            is_df = False
                    if is_df:
                        output.append(f"### {info}")
                        info_link = "#" + info.lower().replace(" ", "-")
                        output_index.append(f"   - [{info}]({info_link})")
                        output.append(f"{df.to_markdown(index=False)}")
                    else:
                        output.append(f"- {info}: {infos.get(info)}")
            else:
                output.append(f"NA")

        # Write stats in markdown file (title, then index, then sections)
        with open(output_file, "w") as fp:
            for item in output_title:
                fp.write("%s\n" % item)
            for item in output_index:
                fp.write("%s\n" % item)
            for item in output:
                fp.write("%s\n" % item)

        # Output stats in markdown (index is only written to the file)
        print("")
        print("\n\n".join(output_title))
        print("")
        print("\n\n".join(output))
        print("")

    return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
def get_input(self) -> str:
    """
    Return the input file path of the object.

    :return: the value of the `input` attribute
    """
    return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
def get_input_format(self, input_file: str = None) -> str:
    """
    Return the file format of the given input file, defaulting to the
    object's own input file when none is provided.

    :param input_file: optional path whose format should be detected
    :type input_file: str
    :return: the format detected by `get_file_format`
    """

    target = input_file or self.get_input()
    return get_file_format(target)

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
def get_input_compressed(self, input_file: str = None) -> str:
    """
    Return the compression status of the given input file, defaulting to
    the object's own input file when none is provided.

    :param input_file: optional path whose compression should be detected
    :type input_file: str
    :return: the compression status detected by `get_file_compressed`
    """

    target = input_file or self.get_input()
    return get_file_compressed(target)

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
def get_output(self) -> str:
    """
    Return the output file path of the object.

    (Docstring fixed: the previous one described "the output of the neuron",
    a copy-paste leftover unrelated to this class.)

    :return: the value of the `output` attribute
    """

    return self.output

It returns the output file path of the object.

Returns

The output file path of the object.

def get_output_format(self, output_file: str = None) -> str:
def get_output_format(self, output_file: str = None) -> str:
    """
    Return the file format of the given output file, defaulting to the
    object's own output file when none is provided.

    :param output_file: optional path whose format should be detected
    :type output_file: str
    :return: the format detected by `get_file_format`
    """

    target = output_file or self.get_output()
    return get_file_format(target)

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
def get_config(self) -> dict:
    """
    Return the configuration dictionary of the object.

    :return: the value of the `config` attribute
    """
    return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
def get_param(self) -> dict:
    """
    Return the parameters dictionary of the object.

    :return: the value of the `param` attribute
    """
    return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
def get_connexion_db(self) -> str:
    """
    Return the database identifier used by the connection.

    :return: the value of the `connexion_db` attribute
    """
    return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
def get_prefix(self) -> str:
    """
    Return the prefix of the object.

    :return: the value of the `prefix` attribute
    """
    return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
def get_table_variants(self, clause: str = "select") -> str:
    """
    Return the variants table identifier to use in a SQL query.

    :param clause: the SQL clause the table will be used in, one of
        "select", "where", "update" or "from", defaults to "select"
    :return: the table name, aliased as "variants" for a FROM clause; for a
        read-only parquet input the file itself is queried directly
    """

    if clause == "from":
        # Read-only access on a parquet input: query the file directly
        read_only = self.get_config().get("access", None) in ["RO"]
        if self.get_input_format() in ["parquet"] and read_only:
            return f"'{self.get_input()}' as variants"
        # Otherwise use the table, aliased for the FROM clause
        return f"{self.table_variants} as variants"

    # "select", "where", "update" and any other clause: plain table name
    return self.table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
def get_tmp_dir(self) -> str:
    """
    Return the temporary directory path, resolved from the object's config
    and param, falling back to "/tmp".

    :return: the temporary directory path resolved by `get_tmp`
    """

    return get_tmp(
        config=self.get_config(),
        param=self.get_param(),
        default_tmp="/tmp",
    )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
def get_connexion_type(self) -> str:
    """
    Return the connexion type from the config, defaulting to "memory".

    (Docstring fixed: the previous one claimed a ValueError is raised for
    unknown connexion types, but no validation is performed here.)

    :return: the configured connexion type
    """
    return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the config, defaulting to "memory".

Returns

The connexion type is being returned.

def get_connexion(self):
def get_connexion(self):
    """
    Return the database connection object of this instance.

    :return: the value of the `conn` attribute
    """
    return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
def close_connexion(self) -> None:
    """
    Close the database connection of this instance.

    :return: the result of the connection's close() call
    """
    return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
def get_header(self, type: str = "vcf"):
    """
    Return the VCF header, either as a vcf.Reader object or as a list of
    header lines.

    :param type: "vcf" for a vcf.Reader object, "list" for the raw header
        lines, defaults to "vcf"
    :return: the header in the requested form; falls back to the minimal
        required VCF header when the object has none
    """

    # Requested as raw header lines
    if type == "list":
        return self.header_list if self.header_vcf else vcf_required
    # Requested as a parsed vcf.Reader object
    if type == "vcf":
        if self.header_vcf:
            return self.header_vcf
        return vcf.Reader(io.StringIO("\n".join(vcf_required)))

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_infos_list(self) -> list:
def get_header_infos_list(self) -> list:
    """
    Return the list of INFO field identifiers declared in the header.

    :return: list of INFO field names
    """

    return [field for field in self.get_header().infos]

This function retrieves a list of information fields from the header.

Returns

A list of information fields from the header.

def get_header_length(self, file: str = None) -> int:
def get_header_length(self, file: str = None) -> int:
    """
    Return the number of header lines, excluding the #CHROM columns line.

    :param file: optional path to a VCF header file to measure instead of
        the object's own header
    :type file: str
    :return: the header length without the #CHROM line, or 0 if there is
        no header at all
    """

    # Measure an explicit header file when one is given
    if file:
        return len(self.read_vcf_header_file(file=file)) - 1

    # Otherwise measure the object's own header lines
    header_list = self.get_header(type="list")
    if header_list:
        return len(header_list) - 1
    return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
def get_header_columns(self) -> str:
    """
    Return the #CHROM columns line of the VCF header.

    (Docstring fixed: the previous one described "the length of the header
    list", but the method returns the last header line.)

    :return: the last header line (the "#CHROM..." columns line), or an
        empty string if there is no header
    """
    if self.get_header():
        return self.get_header(type="list")[-1]
    else:
        return ""

This function returns the #CHROM columns line of the VCF header.

Returns

The last header line (the "#CHROM..." columns line), or an empty string if there is no header.

def get_header_columns_as_list(self) -> list:
def get_header_columns_as_list(self) -> list:
    """
    Return the #CHROM columns line of the VCF header as a list of column
    names.

    (Docstring fixed: the previous one described "the length of the header
    list", but the method returns the split column names.)

    :return: list of column names, or an empty list if there is no header
    """
    if self.get_header():
        return self.get_header_columns().strip().split("\t")
    else:
        return []

This function returns the #CHROM columns line of the VCF header as a list of column names.

Returns

The list of header column names, or an empty list if there is no header.

def get_header_columns_as_sql(self) -> str:
def get_header_columns_as_sql(self) -> str:
    """
    Return the header column names formatted for use in a SQL query:
    each name double-quoted, joined with commas.

    (Docstring fixed: the previous one described "header length", which
    this method never computed.)

    :return: comma-separated list of quoted header column names
    """
    return ",".join(f'"{col}"' for col in self.get_header_columns_as_list())

This function returns the header column names formatted for a SQL query (double-quoted, comma-separated).

Returns

The comma-separated list of quoted header column names.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
def get_header_sample_list(
    self, check: bool = False, samples: list = None, samples_force: bool = False
) -> list:
    """
    Return the list of sample names from the VCF header, optionally
    restricted to a given subset and/or checked as genotype columns.

    :param check: when True, keep only samples whose column is a proper
        genotype column, defaults to False
    :type check: bool (optional)
    :param samples: optional subset of sample names to keep; names absent
        from the header are dropped with a warning
    :type samples: list
    :param samples_force: when True together with `samples`, return the
        subset without the genotype-column check, defaults to False
    :type samples_force: bool (optional)
    :return: the resulting list of sample names
    """

    if samples is None:
        # No subset requested: start from all header samples
        result = self.header_vcf.samples
    else:
        # Keep only the requested samples that exist in the header
        result = []
        for name in samples:
            if name in self.header_vcf.samples:
                result.append(name)
            else:
                log.warning(f"Sample '{name}' not defined in header")

        # Forced subset: skip the genotype-column verification entirely
        if samples_force:
            log.warning(f"Samples {result} not checked if genotypes")
            return result

    if check:
        # Keep only columns that are actually genotype columns
        verified = []
        for name in result:
            if self.is_genotype_column(column=name):
                verified.append(name)
            else:
                log.warning(
                    f"Sample '{name}' not defined as a sample (genotype not well defined)"
                )
        result = verified

    # Return samples list
    return result

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1223    def is_genotype_column(self, column: str = None) -> bool:
1224        """
1225        This function checks if a given column is a genotype column in a database.
1226
1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1228        represents the column name in a database table. This method checks if the specified column is a
1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1230        method of
1231        :type column: str
1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1234        column name and returns the result. If the `column` parameter is None, it returns False.
1235        """
1236
1237        if column is not None:
1238            return Database(database=self.get_input()).is_genotype_column(column=column)
1239        else:
1240            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of the Database class built from the current input
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1242    def get_verbose(self) -> bool:
1243        """
1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1245        exist
1246
1247        :return: The value of the key "verbose" in the config dictionary.
1248        """
1249        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1251    def get_connexion_format(self) -> str:
1252        """
1253        It returns the connexion format of the object.
1254        :return: The connexion_format is being returned.
1255        """
1256        connexion_format = self.connexion_format
1257        if connexion_format not in ["duckdb", "sqlite"]:
1258            log.error(f"Unknown connexion format {connexion_format}")
1259            raise ValueError(f"Unknown connexion format {connexion_format}")
1260        else:
1261            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
1263    def insert_file_to_table(
1264        self,
1265        file,
1266        columns: str,
1267        header_len: int = 0,
1268        sep: str = "\t",
1269        chunksize: int = 1000000,
1270    ) -> None:
1271        """
1272        The function reads a file in chunks and inserts each chunk into a table based on the specified
1273        database format.
1274
1275        :param file: The `file` parameter is the file that you want to load into a table. It should be
1276        the path to the file on your system
1277        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1278        should contain the names of the columns in the table where the data will be inserted. The column
1279        names should be separated by commas within the string. For example, if you have columns named
1280        "id", "name
1281        :type columns: str
1282        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1283        the number of lines to skip at the beginning of the file before reading the actual data. This
1284        parameter allows you to skip any header information present in the file before processing the
1285        data, defaults to 0
1286        :type header_len: int (optional)
1287        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1288        separator character that is used in the file being read. In this case, the default separator is
1289        set to `\t`, which represents a tab character. You can change this parameter to a different
1290        separator character if, defaults to \t
1291        :type sep: str (optional)
1292        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1293        when processing the file in chunks. In the provided code snippet, the default value for
1294        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1295        to 1000000
1296        :type chunksize: int (optional)
1297        """
1298
1299        # Config
1300        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1301        connexion_format = self.get_connexion_format()
1302
1303        log.debug("chunksize: " + str(chunksize))
1304
1305        if chunksize:
1306            for chunk in pd.read_csv(
1307                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1308            ):
1309                if connexion_format in ["duckdb"]:
1310                    sql_insert_into = (
1311                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1312                    )
1313                    self.conn.execute(sql_insert_into)
1314                elif connexion_format in ["sqlite"]:
1315                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string; for example, for columns named "id" and "name", pass the string '"id", "name"'
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. The default separator is the tab character ("\t"); you can change this parameter to a different separator character if needed, defaults to "\t"
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. The default value is 1000000, meaning the file is read in chunks of 1,000,000 rows, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1317    def load_data(
1318        self,
1319        input_file: str = None,
1320        drop_variants_table: bool = False,
1321        sample_size: int = 20480,
1322    ) -> None:
1323        """
1324        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1325        table before loading the data and specify a sample size.
1326
1327        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1328        table
1329        :type input_file: str
1330        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1331        determines whether the variants table should be dropped before loading the data. If set to
1332        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1333        not be dropped, defaults to False
1334        :type drop_variants_table: bool (optional)
1335        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1336        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1337        20480
1338        :type sample_size: int (optional)
1339        """
1340
1341        log.info("Loading...")
1342
1343        # change input file
1344        if input_file:
1345            self.set_input(input_file)
1346            self.set_header()
1347
1348        # drop variants table
1349        if drop_variants_table:
1350            self.drop_variants_table()
1351
1352        # get table variants
1353        table_variants = self.get_table_variants()
1354
1355        # Access
1356        access = self.get_config().get("access", None)
1357        log.debug(f"access: {access}")
1358
1359        # Input format and compress
1360        input_format = self.get_input_format()
1361        input_compressed = self.get_input_compressed()
1362        log.debug(f"input_format: {input_format}")
1363        log.debug(f"input_compressed: {input_compressed}")
1364
1365        # input_compressed_format
1366        if input_compressed:
1367            input_compressed_format = "gzip"
1368        else:
1369            input_compressed_format = "none"
1370        log.debug(f"input_compressed_format: {input_compressed_format}")
1371
1372        # Connexion format
1373        connexion_format = self.get_connexion_format()
1374
1375        # Sample size
1376        if not sample_size:
1377            sample_size = -1
1378        log.debug(f"sample_size: {sample_size}")
1379
1380        # Load data
1381        log.debug(f"Load Data from {input_format}")
1382
1383        # DuckDB connexion
1384        if connexion_format in ["duckdb"]:
1385
1386            # Database already exists
1387            if self.input_format in ["db", "duckdb"]:
1388
1389                if connexion_format in ["duckdb"]:
1390                    log.debug(f"Input file format '{self.input_format}' duckDB")
1391                else:
1392                    log.error(
1393                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1394                    )
1395                    raise ValueError(
1396                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1397                    )
1398
1399            # Load from existing database format
1400            else:
1401
1402                try:
1403                    # Create Table or View
1404                    database = Database(database=self.input)
1405                    sql_from = database.get_sql_from(sample_size=sample_size)
1406
1407                    if access in ["RO"]:
1408                        sql_load = (
1409                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1410                        )
1411                    else:
1412                        sql_load = (
1413                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1414                        )
1415                    self.conn.execute(sql_load)
1416
1417                except:
1418                    # Format not available
1419                    log.error(f"Input file format '{self.input_format}' not available")
1420                    raise ValueError(
1421                        f"Input file format '{self.input_format}' not available"
1422                    )
1423
1424        # SQLite connexion
1425        elif connexion_format in ["sqlite"] and input_format in [
1426            "vcf",
1427            "tsv",
1428            "csv",
1429            "psv",
1430        ]:
1431
1432            # Main structure
1433            structure = {
1434                "#CHROM": "VARCHAR",
1435                "POS": "INTEGER",
1436                "ID": "VARCHAR",
1437                "REF": "VARCHAR",
1438                "ALT": "VARCHAR",
1439                "QUAL": "VARCHAR",
1440                "FILTER": "VARCHAR",
1441                "INFO": "VARCHAR",
1442            }
1443
1444            # Strcuture with samples
1445            structure_complete = structure
1446            if self.get_header_sample_list():
1447                structure["FORMAT"] = "VARCHAR"
1448                for sample in self.get_header_sample_list():
1449                    structure_complete[sample] = "VARCHAR"
1450
1451            # Columns list for create and insert
1452            sql_create_table_columns = []
1453            sql_create_table_columns_list = []
1454            for column in structure_complete:
1455                column_type = structure_complete[column]
1456                sql_create_table_columns.append(
1457                    f'"{column}" {column_type} default NULL'
1458                )
1459                sql_create_table_columns_list.append(f'"{column}"')
1460
1461            # Create database
1462            log.debug(f"Create Table {table_variants}")
1463            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1464            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1465            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1466            self.conn.execute(sql_create_table)
1467
1468            # chunksize define length of file chunk load file
1469            chunksize = 100000
1470
1471            # delimiter
1472            delimiter = file_format_delimiters.get(input_format, "\t")
1473
1474            # Load the input file
1475            with open(self.input, "rt") as input_file:
1476
1477                # Use the appropriate file handler based on the input format
1478                if input_compressed:
1479                    input_file = bgzf.open(self.input, "rt")
1480                if input_format in ["vcf"]:
1481                    header_len = self.get_header_length()
1482                else:
1483                    header_len = 0
1484
1485                # Insert the file contents into a table
1486                self.insert_file_to_table(
1487                    input_file,
1488                    columns=sql_create_table_columns_list_sql,
1489                    header_len=header_len,
1490                    sep=delimiter,
1491                    chunksize=chunksize,
1492                )
1493
1494        else:
1495            log.error(
1496                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1497            )
1498            raise ValueError(
1499                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1500            )
1501
1502        # Explode INFOS fields into table fields
1503        if self.get_explode_infos():
1504            self.explode_infos(
1505                prefix=self.get_explode_infos_prefix(),
1506                fields=self.get_explode_infos_fields(),
1507                force=True,
1508            )
1509
1510        # Create index after insertion
1511        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1513    def get_explode_infos(self) -> bool:
1514        """
1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1516        to False if it is not set.
1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1518        value. If the parameter is not present, it will return False.
1519        """
1520
1521        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1523    def get_explode_infos_fields(
1524        self,
1525        explode_infos_fields: str = None,
1526        remove_fields_not_in_header: bool = False,
1527    ) -> list:
1528        """
1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1530        the input parameter `explode_infos_fields`.
1531
1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1534        comma-separated list of field names to explode
1535        :type explode_infos_fields: str
1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1537        flag that determines whether to remove fields that are not present in the header. If it is set
1538        to `True`, any field that is not in the header will be excluded from the list of exploded
1539        information fields. If it is set to `, defaults to False
1540        :type remove_fields_not_in_header: bool (optional)
1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
1545        splitting the string by commas.
1546        """
1547
1548        # If no fields, get it in param
1549        if not explode_infos_fields:
1550            explode_infos_fields = (
1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1552            )
1553
1554        # If no fields, defined as all fields in header using keyword
1555        if not explode_infos_fields:
1556            explode_infos_fields = "*"
1557
1558        # If fields list not empty
1559        if explode_infos_fields:
1560
1561            # Input fields list
1562            if isinstance(explode_infos_fields, str):
1563                fields_input = explode_infos_fields.split(",")
1564            elif isinstance(explode_infos_fields, list):
1565                fields_input = explode_infos_fields
1566            else:
1567                fields_input = []
1568
1569            # Fields list without * keyword
1570            fields_without_all = fields_input.copy()
1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
1572                fields_without_all.remove("*")
1573
1574            # Fields in header
1575            fields_in_header = sorted(list(set(self.get_header().infos)))
1576
1577            # Construct list of fields
1578            fields_output = []
1579            for field in fields_input:
1580
1581                # Strip field
1582                field = field.strip()
1583
1584                # format keyword * in regex
1585                if field.upper() in ["*"]:
1586                    field = ".*"
1587
1588                # Find all fields with pattern
1589                r = re.compile(rf"^{field}$")
1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
1591
1592                # Remove fields input from search
1593                if field in fields_search:
1594                    fields_search = [field]
1595                elif fields_search != [field]:
1596                    fields_search = sorted(
1597                        list(set(fields_search).difference(fields_input))
1598                    )
1599
1600                # If field is not in header (avoid not well formatted header)
1601                if not fields_search and not remove_fields_not_in_header:
1602                    fields_search = [field]
1603
1604                # Add found fields
1605                for new_field in fields_search:
1606                    # Add field, if not already exists, and if it is in header (if asked)
1607                    if (
1608                        new_field not in fields_output
1609                        and (
1610                            not remove_fields_not_in_header
1611                            or new_field in fields_in_header
1612                        )
1613                        and new_field not in [".*"]
1614                    ):
1615                        fields_output.append(new_field)
1616
1617            return fields_output
1618
1619        else:
1620
1621            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter is a string that specifies the fields to be exploded. It can be set to "*" to explode all fields of the header, or it can be a comma-separated list of field names (or regex patterns) to explode
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields; if it is set to False, such fields are kept, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided (or empty), it defaults to the keyword "*", which expands to all fields found in the header. Otherwise, it returns the list of fields obtained after stripping spaces, splitting the string by commas, and resolving each entry (possibly a regex pattern) against the header fields.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1624        """
1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1627        not provided.
1628
1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1630        prefix to be used for exploding or expanding information
1631        :type explode_infos_prefix: str
1632        :return: the value of the variable `explode_infos_prefix`.
1633        """
1634
1635        if not explode_infos_prefix:
1636            explode_infos_prefix = (
1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1638            )
1639
1640        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1642    def add_column(
1643        self,
1644        table_name,
1645        column_name,
1646        column_type,
1647        default_value=None,
1648        drop: bool = False,
1649    ) -> dict:
1650        """
1651        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1652        doesn't already exist.
1653
1654        :param table_name: The name of the table to which you want to add a column
1655        :param column_name: The parameter "column_name" is the name of the column that you want to add
1656        to the table
1657        :param column_type: The `column_type` parameter specifies the data type of the column that you
1658        want to add to the table. It should be a string that represents the desired data type, such as
1659        "INTEGER", "TEXT", "REAL", etc
1660        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1661        default value for the newly added column. If a default value is provided, it will be assigned to
1662        the column for any existing rows that do not have a value for that column
1663        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1664        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1665        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1666        to False
1667        :type drop: bool (optional)
1668        :return: a boolean value indicating whether the column was successfully added to the table.
1669        """
1670
1671        # added
1672        added = False
1673        dropped = False
1674
1675        # Check if the column already exists in the table
1676        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1677        columns = self.get_query_to_df(query).columns.tolist()
1678        if column_name.upper() in [c.upper() for c in columns]:
1679            log.debug(
1680                f"The {column_name} column already exists in the {table_name} table"
1681            )
1682            if drop:
1683                self.drop_column(table_name=table_name, column_name=column_name)
1684                dropped = True
1685            else:
1686                return None
1687        else:
1688            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1689
1690        # Add column in table
1691        add_column_query = (
1692            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1693        )
1694        if default_value is not None:
1695            add_column_query += f" DEFAULT {default_value}"
1696        self.execute_query(add_column_query)
1697        added = not dropped
1698        log.debug(
1699            f"The {column_name} column was successfully added to the {table_name} table"
1700        )
1701
1702        if added:
1703            added_column = {
1704                "table_name": table_name,
1705                "column_name": column_name,
1706                "column_type": column_type,
1707                "default_value": default_value,
1708            }
1709        else:
1710            added_column = None
1711
1712        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default),, defaults to False
Returns

a dict describing the added column (table_name, column_name, column_type and default_value) when the column was newly added, or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1714    def drop_column(
1715        self, column: dict = None, table_name: str = None, column_name: str = None
1716    ) -> bool:
1717        """
1718        The `drop_column` function drops a specified column from a given table in a database and returns
1719        True if the column was successfully dropped, and False if the column does not exist in the
1720        table.
1721
1722        :param column: The `column` parameter is a dictionary that contains information about the column
1723        you want to drop. It has two keys:
1724        :type column: dict
1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
1726        drop a column
1727        :type table_name: str
1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1729        from the table
1730        :type column_name: str
1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1732        and False if the column does not exist in the table.
1733        """
1734
1735        # Find column infos
1736        if column:
1737            if isinstance(column, dict):
1738                table_name = column.get("table_name", None)
1739                column_name = column.get("column_name", None)
1740            elif isinstance(column, str):
1741                table_name = self.get_table_variants()
1742                column_name = column
1743            else:
1744                table_name = None
1745                column_name = None
1746
1747        if not table_name and not column_name:
1748            return False
1749
1750        # Removed
1751        removed = False
1752
1753        # Check if the column already exists in the table
1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1755        columns = self.get_query_to_df(query).columns.tolist()
1756        if column_name in columns:
1757            log.debug(f"The {column_name} column exists in the {table_name} table")
1758        else:
1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1760            return False
1761
1762        # Add column in table # ALTER TABLE integers DROP k
1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1764        self.execute_query(add_column_query)
1765        removed = True
1766        log.debug(
1767            f"The {column_name} column was successfully dropped to the {table_name} table"
1768        )
1769
1770        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys: table_name and column_name. Alternatively, a plain string can be given and is treated as a column name of the variants table
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
1772    def explode_infos(
1773        self,
1774        prefix: str = None,
1775        create_index: bool = False,
1776        fields: list = None,
1777        force: bool = False,
1778        proccess_all_fields_together: bool = False,
1779        table: str = None,
1780    ) -> list:
1781        """
1782        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
1783        individual columns, returning a list of added columns.
1784
1785        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1786        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1787        `self.get_explode_infos_prefix()` as the prefix
1788        :type prefix: str
1789        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1790        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1791        `False`, indexes will not be created. The default value is `False`, defaults to False
1792        :type create_index: bool (optional)
1793        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
1794        that you want to explode into individual columns. If this parameter is not provided, all INFO
1795        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
1796        a list to the `
1797        :type fields: list
1798        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
1799        determines whether to drop and recreate a column if it already exists in the table. If `force`
1800        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
1801        defaults to False
1802        :type force: bool (optional)
1803        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1804        flag that determines whether to process all the INFO fields together or individually. If set to
1805        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1806        be processed individually. The default value is, defaults to False
1807        :type proccess_all_fields_together: bool (optional)
1808        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
1809        of the table where the exploded INFO fields will be added as individual columns. If you provide
1810        a value for the `table` parameter, the function will use that table name. If the `table`
1811        parameter is
1812        :type table: str
1813        :return: The `explode_infos` function returns a list of added columns.
1814        """
1815
1816        # drop indexes
1817        self.drop_indexes()
1818
1819        # connexion format
1820        connexion_format = self.get_connexion_format()
1821
1822        # Access
1823        access = self.get_config().get("access", None)
1824
1825        # Added columns
1826        added_columns = []
1827
1828        if access not in ["RO"]:
1829
1830            # prefix
1831            if prefix in [None, True] or not isinstance(prefix, str):
1832                if self.get_explode_infos_prefix() not in [None, True]:
1833                    prefix = self.get_explode_infos_prefix()
1834                else:
1835                    prefix = "INFO/"
1836
1837            # table variants
1838            if table is not None:
1839                table_variants = table
1840            else:
1841                table_variants = self.get_table_variants(clause="select")
1842
1843            # extra infos
1844            try:
1845                extra_infos = self.get_extra_infos()
1846            except:
1847                extra_infos = []
1848
1849            # Header infos
1850            header_infos = self.get_header().infos
1851
1852            log.debug(
1853                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1854            )
1855
1856            sql_info_alter_table_array = []
1857
1858            # Info fields to check
1859            fields_list = list(header_infos)
1860            if fields:
1861                fields_list += fields
1862            fields_list = set(fields_list)
1863
1864            # If no fields
1865            if not fields:
1866                fields = []
1867
1868            # Translate fields if patterns
1869            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1870
1871            for info in fields:
1872
1873                info_id_sql = prefix + info
1874
1875                if (
1876                    info in fields_list
1877                    or prefix + info in fields_list
1878                    or info in extra_infos
1879                ):
1880
1881                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1882
1883                    if info in header_infos:
1884                        info_type = header_infos[info].type
1885                        info_num = header_infos[info].num
1886                    else:
1887                        info_type = "String"
1888                        info_num = 0
1889
1890                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1891                    if info_num != 1:
1892                        type_sql = "VARCHAR"
1893
1894                    # Add field
1895                    added_column = self.add_column(
1896                        table_name=table_variants,
1897                        column_name=info_id_sql,
1898                        column_type=type_sql,
1899                        default_value="null",
1900                        drop=force,
1901                    )
1902
1903                    if added_column:
1904                        added_columns.append(added_column)
1905
1906                    if added_column or force:
1907
1908                        # add field to index
1909                        self.index_additionnal_fields.append(info_id_sql)
1910
1911                        # Update field array
1912                        if connexion_format in ["duckdb"]:
1913                            update_info_field = f"""
1914                            "{info_id_sql}" =
1915                                CASE
1916                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1917                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1918                                END
1919                            """
1920                        elif connexion_format in ["sqlite"]:
1921                            update_info_field = f"""
1922                                "{info_id_sql}" =
1923                                    CASE
1924                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1925                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1926                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1927                                    END
1928                            """
1929
1930                        sql_info_alter_table_array.append(update_info_field)
1931
1932            if sql_info_alter_table_array:
1933
1934                # By chromosomes
1935                try:
1936                    chromosomes_list = list(
1937                        self.get_query_to_df(
1938                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1939                        )["#CHROM"]
1940                    )
1941                except:
1942                    chromosomes_list = [None]
1943
1944                for chrom in chromosomes_list:
1945                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1946
1947                    # Where clause
1948                    where_clause = ""
1949                    if chrom and len(chromosomes_list) > 1:
1950                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1951
1952                    # Update table
1953                    if proccess_all_fields_together:
1954                        sql_info_alter_table_array_join = ", ".join(
1955                            sql_info_alter_table_array
1956                        )
1957                        if sql_info_alter_table_array_join:
1958                            sql_info_alter_table = f"""
1959                                UPDATE {table_variants}
1960                                SET {sql_info_alter_table_array_join}
1961                                {where_clause}
1962                                """
1963                            log.debug(
1964                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1965                            )
1966                            # log.debug(sql_info_alter_table)
1967                            self.conn.execute(sql_info_alter_table)
1968                    else:
1969                        sql_info_alter_num = 0
1970                        for sql_info_alter in sql_info_alter_table_array:
1971                            sql_info_alter_num += 1
1972                            sql_info_alter_table = f"""
1973                                UPDATE {table_variants}
1974                                SET {sql_info_alter}
1975                                {where_clause}
1976                                """
1977                            log.debug(
1978                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1979                            )
1980                            # log.debug(sql_info_alter_table)
1981                            self.conn.execute(sql_info_alter_table)
1982
1983        # create indexes
1984        if create_index:
1985            self.create_indexes()
1986
1987        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. You can specify the INFO fields to explode by passing them as a list to the `fields` parameter; patterns are translated via get_explode_infos_fields
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if set to False, an existing column is left as is. Defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together in a single UPDATE statement; if set to False, each INFO field will be processed individually. Defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If the table parameter is not provided, the default variants table is used
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1989    def create_indexes(self) -> None:
1990        """
1991        Create indexes on the table after insertion
1992        """
1993
1994        # Access
1995        access = self.get_config().get("access", None)
1996
1997        # get table variants
1998        table_variants = self.get_table_variants("FROM")
1999
2000        if self.get_indexing() and access not in ["RO"]:
2001            # Create index
2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2003            self.conn.execute(sql_create_table_index)
2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2005            self.conn.execute(sql_create_table_index)
2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2007            self.conn.execute(sql_create_table_index)
2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
2009            self.conn.execute(sql_create_table_index)
2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2011            self.conn.execute(sql_create_table_index)
2012            for field in self.index_additionnal_fields:
2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2014                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
2016    def drop_indexes(self) -> None:
2017        """
2018        Create indexes on the table after insertion
2019        """
2020
2021        # Access
2022        access = self.get_config().get("access", None)
2023
2024        # get table variants
2025        table_variants = self.get_table_variants("FROM")
2026
2027        # Get database format
2028        connexion_format = self.get_connexion_format()
2029
2030        if access not in ["RO"]:
2031            if connexion_format in ["duckdb"]:
2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2033            elif connexion_format in ["sqlite"]:
2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2035
2036            list_indexes = self.conn.execute(sql_list_indexes)
2037            index_names = [row[0] for row in list_indexes.fetchall()]
2038            for index in index_names:
2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2040                self.conn.execute(sql_drop_table_index)

Drop all existing indexes on the variants table.

def read_vcf_header(self, f) -> list:
2042    def read_vcf_header(self, f) -> list:
2043        """
2044        It reads the header of a VCF file and returns a list of the header lines
2045
2046        :param f: the file object
2047        :return: The header lines of the VCF file.
2048        """
2049
2050        header_list = []
2051        for line in f:
2052            header_list.append(line)
2053            if line.startswith("#CHROM"):
2054                break
2055        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2057    def read_vcf_header_file(self, file: str = None) -> list:
2058        """
2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2060        uncompressed files.
2061
2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2064        default to `None`
2065        :type file: str
2066        :return: The function `read_vcf_header_file` returns a list.
2067        """
2068
2069        if self.get_input_compressed(input_file=file):
2070            with bgzf.open(file, "rt") as f:
2071                return self.read_vcf_header(f=f)
2072        else:
2073            with open(file, "rt") as f:
2074                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2076    def execute_query(self, query: str):
2077        """
2078        It takes a query as an argument, executes it, and returns the results
2079
2080        :param query: The query to be executed
2081        :return: The result of the query is being returned.
2082        """
2083        if query:
2084            return self.conn.execute(query)  # .fetchall()
2085        else:
2086            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None, fields_to_rename: dict | None = None) -> bool:
2088    def export_output(
2089        self,
2090        output_file: str | None = None,
2091        output_header: str | None = None,
2092        export_header: bool = True,
2093        query: str | None = None,
2094        parquet_partitions: list | None = None,
2095        chunk_size: int | None = None,
2096        threads: int | None = None,
2097        sort: bool = False,
2098        index: bool = False,
2099        order_by: str | None = None,
2100        fields_to_rename: dict | None = None,
2101    ) -> bool:
2102        """
2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
2105        partitioning.
2106
2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
2108        output file where the exported data will be saved
2109        :type output_file: str | None
2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
2112        header will be exported to a file with the same name as the `output_file` parameter, but with
2113        the extension "
2114        :type output_header: str | None
2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2117        True, the header will be exported to a file. If `export_header` is False, the header will not
2118        be, defaults to True
2119        :type export_header: bool (optional)
2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
2121        that can be used to filter and select specific data from the VCF file before exporting it. If
2122        provided, only the data that matches the query will be exported. This allows you to customize
2123        the exported data based on
2124        :type query: str | None
2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2127        organize data in a hierarchical directory structure based on the values of one or more columns.
2128        This can improve query performance when working with large datasets
2129        :type parquet_partitions: list | None
2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
2132        multiple files. It helps in optimizing the export process by breaking down the data into
2133        manageable chunks for processing and storage
2134        :type chunk_size: int | None
2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
2136        threads to be used during the export process. It determines the level of parallelism and can
2137        improve the performance of the export operation. If this parameter is not provided, the function
2138        will use the default number of threads
2139        :type threads: int | None
2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
2141        determines whether the output file should be sorted based on genomic coordinates of the
2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
2143        `False`,, defaults to False
2144        :type sort: bool (optional)
2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
2146        determines whether an index should be created on the output file. If `index` is set to `True`,
2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
2148        :type index: bool (optional)
2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
2152        output file should be
2153        :type order_by: str | None
2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
2155        mapping of field names to be renamed during the export process. This parameter allows you to
2156        customize the output field names before exporting the data. Each key-value pair in the
2157        dictionary represents the original field name as the key and the new field name
2158        :type fields_to_rename: dict | None
2159        :return: The `export_output` function returns a boolean value. It checks if the output file
2160        exists and returns True if it does, or None if it doesn't.
2161        """
2162
2163        # Log
2164        log.info("Exporting...")
2165
2166        # Full path
2167        output_file = full_path(output_file)
2168        output_header = full_path(output_header)
2169
2170        # Config
2171        config = self.get_config()
2172
2173        # Param
2174        param = self.get_param()
2175
2176        # Tmp files to remove
2177        tmp_to_remove = []
2178
2179        # If no output, get it
2180        if not output_file:
2181            output_file = self.get_output()
2182
2183        # If not threads
2184        if not threads:
2185            threads = self.get_threads()
2186
2187        # Rename fields
2188        if not fields_to_rename:
2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
2191
2192        # Auto header name with extension
2193        if export_header or output_header:
2194            if not output_header:
2195                output_header = f"{output_file}.hdr"
2196            # Export header
2197            self.export_header(output_file=output_file)
2198
2199        # Switch off export header if VCF output
2200        output_file_type = get_file_format(output_file)
2201        if output_file_type in ["vcf"]:
2202            export_header = False
2203            tmp_to_remove.append(output_header)
2204
2205        # Chunk size
2206        if not chunk_size:
2207            chunk_size = config.get("chunk_size", None)
2208
2209        # Parquet partition
2210        if not parquet_partitions:
2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2212        if parquet_partitions and isinstance(parquet_partitions, str):
2213            parquet_partitions = parquet_partitions.split(",")
2214
2215        # Order by
2216        if not order_by:
2217            order_by = param.get("export", {}).get("order_by", "")
2218
2219        # Header in output
2220        header_in_output = param.get("export", {}).get("include_header", False)
2221
2222        # Database
2223        database_source = self.get_connexion()
2224
2225        # Connexion format
2226        connexion_format = self.get_connexion_format()
2227
2228        # Explode infos
2229        if self.get_explode_infos():
2230            self.explode_infos(
2231                prefix=self.get_explode_infos_prefix(),
2232                fields=self.get_explode_infos_fields(),
2233                force=False,
2234            )
2235
2236        # if connexion_format in ["sqlite"] or query:
2237        if connexion_format in ["sqlite"]:
2238
2239            # Export in Parquet
2240            random_tmp = "".join(
2241                random.choice(string.ascii_lowercase) for i in range(10)
2242            )
2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2244            tmp_to_remove.append(database_source)
2245
2246            # Table Variants
2247            table_variants = self.get_table_variants()
2248
2249            # Create export query
2250            sql_query_export_subquery = f"""
2251                SELECT * FROM {table_variants}
2252                """
2253
2254            # Write source file
2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2256
2257        # Create database
2258        database = Database(
2259            database=database_source,
2260            table="variants",
2261            header_file=output_header,
2262            conn_config=self.get_connexion_config(),
2263        )
2264
2265        # Existing colomns header
2266        existing_columns_header = database.get_header_columns_from_database(query=query)
2267
2268        # Sample list
2269        if output_file_type in ["vcf"]:
2270            get_samples = self.get_samples()
2271            get_samples_check = self.get_samples_check()
2272            samples_force = get_samples is not None
2273            sample_list = self.get_header_sample_list(
2274                check=get_samples_check,
2275                samples=get_samples,
2276                samples_force=samples_force,
2277            )
2278        else:
2279            sample_list = None
2280
2281        # Export file
2282        database.export(
2283            output_database=output_file,
2284            output_header=output_header,
2285            existing_columns_header=existing_columns_header,
2286            parquet_partitions=parquet_partitions,
2287            chunk_size=chunk_size,
2288            threads=threads,
2289            sort=sort,
2290            index=index,
2291            header_in_output=header_in_output,
2292            order_by=order_by,
2293            query=query,
2294            export_header=export_header,
2295            sample_list=sample_list,
2296        )
2297
2298        # Remove
2299        remove_if_exists(tmp_to_remove)
2300
2301        return (os.path.exists(output_file) or None) and (
2302            os.path.exists(output_file) or None
2303        )

The export_output function exports data from a VCF file to various formats, including VCF, CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and partitioning.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True
  • query: The query parameter in the export_output function is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported. This allows you to customize the exported data based on
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in a batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. It helps in optimizing the export process by breaking down the data into manageable chunks for processing and storage
  • threads: The threads parameter in the export_output function specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If this parameter is not provided, the function will use the default number of threads
  • sort: The sort parameter in the export_output function is a boolean flag that determines whether the output file should be sorted based on genomic coordinates of the variants. If sort is set to True, the output file will be sorted. If sort is set to False,, defaults to False
  • index: The index parameter in the export_output function is a boolean flag that determines whether an index should be created on the output file. If index is set to True, an index will be created on the output file. If index is set to False, no, defaults to False
  • order_by: The order_by parameter in the export_output function is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format. It allows you to specify the column(s) based on which the output file should be
  • fields_to_rename: The fields_to_rename parameter is a dictionary that specifies the mapping of field names to be renamed during the export process. This parameter allows you to customize the output field names before exporting the data. Each key-value pair in the dictionary represents the original field name as the key and the new field name
Returns

The export_output function returns a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2305    def get_extra_infos(self, table: str = None) -> list:
2306        """
2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2308        in the header.
2309
2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2311        name of the table from which you want to retrieve the extra columns that are not present in the
2312        header. If the `table` parameter is not provided when calling the function, it will default to
2313        using the variants
2314        :type table: str
2315        :return: A list of columns that are in the specified table but not in the header of the table.
2316        """
2317
2318        header_columns = []
2319
2320        if not table:
2321            table = self.get_table_variants(clause="from")
2322            header_columns = self.get_header_columns()
2323
2324        # Check all columns in the database
2325        query = f""" SELECT * FROM {table} LIMIT 1 """
2326        log.debug(f"query {query}")
2327        table_columns = self.get_query_to_df(query).columns.tolist()
2328        extra_columns = []
2329
2330        # Construct extra infos (not in header)
2331        for column in table_columns:
2332            if column not in header_columns:
2333                extra_columns.append(column)
2334
2335        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants table
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2337    def get_extra_infos_sql(self, table: str = None) -> str:
2338        """
2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2340        by double quotes
2341
2342        :param table: The name of the table to get the extra infos from. If None, the default table is
2343        used
2344        :type table: str
2345        :return: A string of the extra infos
2346        """
2347
2348        return ", ".join(
2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2350        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2352    def export_header(
2353        self,
2354        header_name: str = None,
2355        output_file: str = None,
2356        output_file_ext: str = ".hdr",
2357        clean_header: bool = True,
2358        remove_chrom_line: bool = False,
2359    ) -> str:
2360        """
2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2362        specified options, and writes it to a new file.
2363
2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2365        this parameter is not specified, the header will be written to the output file
2366        :type header_name: str
2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
2368        specify the name of the output file where the header will be written. If this parameter is not
2369        provided, the header will be written to a temporary file
2370        :type output_file: str
2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
2373        if not specified by the user. This extension will be appended to the `output_file` name to
2374        create the final, defaults to .hdr
2375        :type output_file_ext: str (optional)
2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2378        `True`, the function will clean the header by modifying certain lines based on a specific
2379        pattern. If `clean_header`, defaults to True
2380        :type clean_header: bool (optional)
2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2382        boolean flag that determines whether the #CHROM line should be removed from the header before
2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2384        defaults to False
2385        :type remove_chrom_line: bool (optional)
2386        :return: The function `export_header` returns the name of the temporary header file that is
2387        created.
2388        """
2389
2390        if not header_name and not output_file:
2391            output_file = self.get_output()
2392
2393        if self.get_header():
2394
2395            # Get header object
2396            header_obj = self.get_header()
2397
2398            # Create database
2399            db_for_header = Database(database=self.get_input())
2400
2401            # Get real columns in the file
2402            db_header_columns = db_for_header.get_columns()
2403
2404            with tempfile.TemporaryDirectory() as tmpdir:
2405
2406                # Write header file
2407                header_file_tmp = os.path.join(tmpdir, "header")
2408                f = open(header_file_tmp, "w")
2409                vcf.Writer(f, header_obj)
2410                f.close()
2411
2412                # Replace #CHROM line with rel columns
2413                header_list = db_for_header.read_header_file(
2414                    header_file=header_file_tmp
2415                )
2416                header_list[-1] = "\t".join(db_header_columns)
2417
2418                # Remove CHROM line
2419                if remove_chrom_line:
2420                    header_list.pop()
2421
2422                # Clean header
2423                if clean_header:
2424                    header_list_clean = []
2425                    for head in header_list:
2426                        # Clean head for malformed header
2427                        head_clean = head
2428                        head_clean = re.subn(
2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2431                            head_clean,
2432                            2,
2433                        )[0]
2434                        # Write header
2435                        header_list_clean.append(head_clean)
2436                    header_list = header_list_clean
2437
2438            tmp_header_name = output_file + output_file_ext
2439
2440            f = open(tmp_header_name, "w")
2441            for line in header_list:
2442                f.write(line)
2443            f.close()
2444
2445        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. Defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2447    def export_variant_vcf(
2448        self,
2449        vcf_file,
2450        remove_info: bool = False,
2451        add_samples: bool = True,
2452        list_samples: list = [],
2453        where_clause: str = "",
2454        index: bool = False,
2455        threads: int | None = None,
2456    ) -> bool | None:
2457        """
2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2459        remove INFO field, add samples, and control compression and indexing.
2460
2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2462        written to. It is the output file that will contain the filtered VCF data based on the specified
2463        parameters
2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2467        in, defaults to False
2468        :type remove_info: bool (optional)
2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2471        If set to False, the samples will be removed. The default value is True, defaults to True
2472        :type add_samples: bool (optional)
2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2474        in the output VCF file. By default, all samples will be included. If you provide a list of
2475        samples, only those samples will be included in the output file
2476        :type list_samples: list
2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2478        determines whether or not to create an index for the output VCF file. If `index` is set to
2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2480        :type index: bool (optional)
2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
2483        will be used during the export process. More threads can potentially speed up the export process
2484        by utilizing multiple cores of the processor. If
2485        :type threads: int | None
2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2487        method with various parameters including the output file, query, threads, sort flag, and index
2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
2489        specified parameters and configurations provided in the `export_variant_vcf` function.
2490        """
2491
2492        # Config
2493        config = self.get_config()
2494
2495        # Extract VCF
2496        log.debug("Export VCF...")
2497
2498        # Table variants
2499        table_variants = self.get_table_variants()
2500
2501        # Threads
2502        if not threads:
2503            threads = self.get_threads()
2504
2505        # Info fields
2506        if remove_info:
2507            if not isinstance(remove_info, str):
2508                remove_info = "."
2509            info_field = f"""'{remove_info}' as INFO"""
2510        else:
2511            info_field = "INFO"
2512
2513        # Samples fields
2514        if add_samples:
2515            if not list_samples:
2516                list_samples = self.get_header_sample_list()
2517            if list_samples:
2518                samples_fields = " , FORMAT , " + " , ".join(
2519                    [f""" "{sample}" """ for sample in list_samples]
2520                )
2521            else:
2522                samples_fields = ""
2523            log.debug(f"samples_fields: {samples_fields}")
2524        else:
2525            samples_fields = ""
2526
2527        # Where clause
2528        if where_clause is None:
2529            where_clause = ""
2530
2531        # Variants
2532        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2533        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2534        log.debug(f"sql_query_select={sql_query_select}")
2535
2536        return self.export_output(
2537            output_file=vcf_file,
2538            output_header=None,
2539            export_header=True,
2540            query=sql_query_select,
2541            parquet_partitions=None,
2542            chunk_size=config.get("chunk_size", None),
2543            threads=threads,
2544            sort=True,
2545            index=index,
2546            order_by=None,
2547        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, the object's default thread count is used
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2549    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2550        """
2551        It takes a list of commands and runs them in parallel using the number of threads specified
2552
2553        :param commands: A list of commands to run
2554        :param threads: The number of threads to use, defaults to 1 (optional)
2555        """
2556
2557        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2559    def get_threads(self, default: int = 1) -> int:
2560        """
2561        This function returns the number of threads to use for a job, with a default value of 1 if not
2562        specified.
2563
2564        :param default: The `default` parameter in the `get_threads` method is used to specify the
2565        default number of threads to use if no specific value is provided. If no value is provided for
2566        the `threads` parameter in the configuration or input parameters, the `default` value will be
2567        used, defaults to 1
2568        :type default: int (optional)
2569        :return: the number of threads to use for the current job.
2570        """
2571
2572        # Config
2573        config = self.get_config()
2574
2575        # Param
2576        param = self.get_param()
2577
2578        # Input threads
2579        input_thread = param.get("threads", config.get("threads", None))
2580
2581        # Check threads
2582        if not input_thread:
2583            threads = default
2584        elif int(input_thread) <= 0:
2585            threads = os.cpu_count()
2586        else:
2587            threads = int(input_thread)
2588        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2590    def get_memory(self, default: str = None) -> str:
2591        """
2592        This function retrieves the memory value from parameters or configuration with a default value
2593        if not found.
2594
2595        :param default: The `get_memory` function takes in a default value as a string parameter. This
2596        default value is used as a fallback in case the `memory` parameter is not provided in the
2597        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2598        the function
2599        :type default: str
2600        :return: The `get_memory` function returns a string value representing the memory parameter. If
2601        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2602        return the default value provided as an argument to the function.
2603        """
2604
2605        # Config
2606        config = self.get_config()
2607
2608        # Param
2609        param = self.get_param()
2610
2611        # Input threads
2612        input_memory = param.get("memory", config.get("memory", None))
2613
2614        # Check threads
2615        if input_memory:
2616            memory = input_memory
2617        else:
2618            memory = default
2619
2620        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2622    def update_from_vcf(self, vcf_file: str) -> None:
2623        """
2624        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2625
2626        :param vcf_file: the path to the VCF file
2627        """
2628
2629        connexion_format = self.get_connexion_format()
2630
2631        if connexion_format in ["duckdb"]:
2632            self.update_from_vcf_duckdb(vcf_file)
2633        elif connexion_format in ["sqlite"]:
2634            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using DuckDB.

        The VCF is loaded into a pandas DataFrame named `vcf_df`; the SQL query
        references that name directly, which DuckDB resolves against the local
        DataFrame (replacement scan), so no explicit temporary table is needed.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping the meta-header lines so the
        # #CHROM line becomes the column header.
        # NOTE: vcf_df looks unused but is referenced by name in the SQL below.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO for matching variants
        # (#CHROM/POS/REF/ALT), separated by ';' when both sides are non-empty;
        # '' and '.' are treated as empty.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using SQLite.

        A temporary table is created with the variants schema, the VCF is
        loaded into it, the variants INFO column is updated from it, and the
        temporary table is dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without copying any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table, skipping '#' header lines.
        # NOTE(review): assumes the VCF has exactly 8 columns (no FORMAT or
        # sample columns) — verify against callers.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data: append the VCF INFO to the
        # existing INFO for matching variants (#CHROM/POS/REF/ALT), separated
        # by ';' when both sides are non-empty ('' and '.' count as empty).
        # warning: CONCAT as || operator
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2750    def drop_variants_table(self) -> None:
2751        """
2752        > This function drops the variants table
2753        """
2754
2755        table_variants = self.get_table_variants()
2756        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2757        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant_id column to the variants table and populate it with a
        hash built from the assembly, `#CHROM`, `POS`, `REF`, `ALT` and the
        exploded SVTYPE field.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config, then the default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column so it can enter the hash;
        # the columns added here are dropped again at the end
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id", not
        # variant_id_column — confirm this is intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the single-quoted string
            # literal '"{prefix}SVTYPE"' (the column NAME, not its value) —
            # confirm this is intended rather than a column reference.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2818    def get_variant_id_column(
2819        self, variant_id_column: str = "variant_id", force: bool = None
2820    ) -> str:
2821        """
2822        This function returns the variant_id column name
2823
2824        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2825        defaults to variant_id
2826        :type variant_id_column: str (optional)
2827        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2828        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2829        if it is not already set, or if it is set
2830        :type force: bool
2831        :return: The variant_id column name.
2832        """
2833
2834        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2840    def scan_databases(
2841        self,
2842        database_formats: list = ["parquet"],
2843        database_releases: list = ["current"],
2844    ) -> dict:
2845        """
2846        The function `scan_databases` scans for available databases based on specified formats and
2847        releases.
2848
2849        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2850        of the databases to be scanned. In this case, the accepted format is "parquet"
2851        :type database_formats: list ["parquet"]
2852        :param database_releases: The `database_releases` parameter is a list that specifies the
2853        releases of the databases to be scanned. In the provided function, the default value for
2854        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2855        databases that are in the "current"
2856        :type database_releases: list
2857        :return: The function `scan_databases` returns a dictionary containing information about
2858        databases that match the specified formats and releases.
2859        """
2860
2861        # Config
2862        config = self.get_config()
2863
2864        # Param
2865        param = self.get_param()
2866
2867        # Param - Assembly
2868        assembly = param.get("assembly", config.get("assembly", None))
2869        if not assembly:
2870            assembly = DEFAULT_ASSEMBLY
2871            log.warning(f"Default assembly '{assembly}'")
2872
2873        # Scan for availabled databases
2874        log.info(
2875            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2876        )
2877        databases_infos_dict = databases_infos(
2878            database_folder_releases=database_releases,
2879            database_formats=database_formats,
2880            assembly=assembly,
2881            config=config,
2882        )
2883        log.info(
2884            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2885        )
2886
2887        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases in the "current" release
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2889    def annotation(self) -> None:
2890        """
2891        It annotates the VCF file with the annotations specified in the config file.
2892        """
2893
2894        # Config
2895        config = self.get_config()
2896
2897        # Param
2898        param = self.get_param()
2899
2900        # Param - Assembly
2901        assembly = param.get("assembly", config.get("assembly", None))
2902        if not assembly:
2903            assembly = DEFAULT_ASSEMBLY
2904            log.warning(f"Default assembly '{assembly}'")
2905
2906        # annotations databases folders
2907        annotations_databases = set(
2908            config.get("folders", {})
2909            .get("databases", {})
2910            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2911            + config.get("folders", {})
2912            .get("databases", {})
2913            .get("parquet", ["~/howard/databases/parquet/current"])
2914            + config.get("folders", {})
2915            .get("databases", {})
2916            .get("bcftools", ["~/howard/databases/bcftools/current"])
2917        )
2918
2919        # Get param annotations
2920        if param.get("annotations", None) and isinstance(
2921            param.get("annotations", None), str
2922        ):
2923            log.debug(param.get("annotations", None))
2924            param_annotation_list = param.get("annotations").split(",")
2925        else:
2926            param_annotation_list = []
2927
2928        # Each tools param
2929        if param.get("annotation_parquet", None) != None:
2930            log.debug(
2931                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2932            )
2933            if isinstance(param.get("annotation_parquet", None), list):
2934                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2935            else:
2936                param_annotation_list.append(param.get("annotation_parquet"))
2937        if param.get("annotation_snpsift", None) != None:
2938            if isinstance(param.get("annotation_snpsift", None), list):
2939                param_annotation_list.append(
2940                    "snpsift:"
2941                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2942                )
2943            else:
2944                param_annotation_list.append(
2945                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2946                )
2947        if param.get("annotation_snpeff", None) != None:
2948            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2949        if param.get("annotation_bcftools", None) != None:
2950            if isinstance(param.get("annotation_bcftools", None), list):
2951                param_annotation_list.append(
2952                    "bcftools:"
2953                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2954                )
2955            else:
2956                param_annotation_list.append(
2957                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2958                )
2959        if param.get("annotation_annovar", None) != None:
2960            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2961        if param.get("annotation_exomiser", None) != None:
2962            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2963        if param.get("annotation_splice", None) != None:
2964            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2965
2966        # Merge param annotations list
2967        param["annotations"] = ",".join(param_annotation_list)
2968
2969        # debug
2970        log.debug(f"param_annotations={param['annotations']}")
2971
2972        if param.get("annotations"):
2973
2974            # Log
2975            # log.info("Annotations - Check annotation parameters")
2976
2977            if not "annotation" in param:
2978                param["annotation"] = {}
2979
2980            # List of annotations parameters
2981            annotations_list_input = {}
2982            if isinstance(param.get("annotations", None), str):
2983                annotation_file_list = [
2984                    value for value in param.get("annotations", "").split(",")
2985                ]
2986                for annotation_file in annotation_file_list:
2987                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
2988            else:
2989                annotations_list_input = param.get("annotations", {})
2990
2991            log.info(f"Quick Annotations:")
2992            for annotation_key in list(annotations_list_input.keys()):
2993                log.info(f"   {annotation_key}")
2994
2995            # List of annotations and associated fields
2996            annotations_list = {}
2997
2998            for annotation_file in annotations_list_input:
2999
3000                # Explode annotations if ALL
3001                if (
3002                    annotation_file.upper() == "ALL"
3003                    or annotation_file.upper().startswith("ALL:")
3004                ):
3005
3006                    # check ALL parameters (formats, releases)
3007                    annotation_file_split = annotation_file.split(":")
3008                    database_formats = "parquet"
3009                    database_releases = "current"
3010                    for annotation_file_option in annotation_file_split[1:]:
3011                        database_all_options_split = annotation_file_option.split("=")
3012                        if database_all_options_split[0] == "format":
3013                            database_formats = database_all_options_split[1].split("+")
3014                        if database_all_options_split[0] == "release":
3015                            database_releases = database_all_options_split[1].split("+")
3016
3017                    # Scan for availabled databases
3018                    databases_infos_dict = self.scan_databases(
3019                        database_formats=database_formats,
3020                        database_releases=database_releases,
3021                    )
3022
3023                    # Add found databases in annotation parameters
3024                    for database_infos in databases_infos_dict.keys():
3025                        annotations_list[database_infos] = {"INFO": None}
3026
3027                else:
3028                    annotations_list[annotation_file] = annotations_list_input[
3029                        annotation_file
3030                    ]
3031
3032            # Check each databases
3033            if len(annotations_list):
3034
3035                log.info(
3036                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3037                )
3038
3039                for annotation_file in annotations_list:
3040
3041                    # Init
3042                    annotations = annotations_list.get(annotation_file, None)
3043
3044                    # Annotation snpEff
3045                    if annotation_file.startswith("snpeff"):
3046
3047                        log.debug(f"Quick Annotation snpEff")
3048
3049                        if "snpeff" not in param["annotation"]:
3050                            param["annotation"]["snpeff"] = {}
3051
3052                        if "options" not in param["annotation"]["snpeff"]:
3053                            param["annotation"]["snpeff"]["options"] = ""
3054
3055                        # snpEff options in annotations
3056                        param["annotation"]["snpeff"]["options"] = "".join(
3057                            annotation_file.split(":")[1:]
3058                        )
3059
3060                    # Annotation Annovar
3061                    elif annotation_file.startswith("annovar"):
3062
3063                        log.debug(f"Quick Annotation Annovar")
3064
3065                        if "annovar" not in param["annotation"]:
3066                            param["annotation"]["annovar"] = {}
3067
3068                        if "annotations" not in param["annotation"]["annovar"]:
3069                            param["annotation"]["annovar"]["annotations"] = {}
3070
3071                        # Options
3072                        annotation_file_split = annotation_file.split(":")
3073                        for annotation_file_annotation in annotation_file_split[1:]:
3074                            if annotation_file_annotation:
3075                                param["annotation"]["annovar"]["annotations"][
3076                                    annotation_file_annotation
3077                                ] = annotations
3078
3079                    # Annotation Exomiser
3080                    elif annotation_file.startswith("exomiser"):
3081
3082                        log.debug(f"Quick Annotation Exomiser")
3083
3084                        param["annotation"]["exomiser"] = params_string_to_dict(
3085                            annotation_file
3086                        )
3087
3088                    # Annotation Splice
3089                    elif annotation_file.startswith("splice"):
3090
3091                        log.debug(f"Quick Annotation Splice")
3092
3093                        param["annotation"]["splice"] = params_string_to_dict(
3094                            annotation_file
3095                        )
3096
3097                    # Annotation Parquet or BCFTOOLS
3098                    else:
3099
3100                        # Tools detection
3101                        if annotation_file.startswith("bcftools:"):
3102                            annotation_tool_initial = "bcftools"
3103                            annotation_file = ":".join(annotation_file.split(":")[1:])
3104                        elif annotation_file.startswith("snpsift:"):
3105                            annotation_tool_initial = "snpsift"
3106                            annotation_file = ":".join(annotation_file.split(":")[1:])
3107                        elif annotation_file.startswith("bigwig:"):
3108                            annotation_tool_initial = "bigwig"
3109                            annotation_file = ":".join(annotation_file.split(":")[1:])
3110                        else:
3111                            annotation_tool_initial = None
3112
3113                        # list of files
3114                        annotation_file_list = annotation_file.replace("+", ":").split(
3115                            ":"
3116                        )
3117
3118                        for annotation_file in annotation_file_list:
3119
3120                            if annotation_file:
3121
3122                                # Annotation tool initial
3123                                annotation_tool = annotation_tool_initial
3124
3125                                # Find file
3126                                annotation_file_found = None
3127
3128                                if os.path.exists(annotation_file):
3129                                    annotation_file_found = annotation_file
3130                                elif os.path.exists(full_path(annotation_file)):
3131                                    annotation_file_found = full_path(annotation_file)
3132                                else:
3133                                    # Find within assembly folders
3134                                    for annotations_database in annotations_databases:
3135                                        found_files = find_all(
3136                                            annotation_file,
3137                                            os.path.join(
3138                                                annotations_database, assembly
3139                                            ),
3140                                        )
3141                                        if len(found_files) > 0:
3142                                            annotation_file_found = found_files[0]
3143                                            break
3144                                    if not annotation_file_found and not assembly:
3145                                        # Find within folders
3146                                        for (
3147                                            annotations_database
3148                                        ) in annotations_databases:
3149                                            found_files = find_all(
3150                                                annotation_file, annotations_database
3151                                            )
3152                                            if len(found_files) > 0:
3153                                                annotation_file_found = found_files[0]
3154                                                break
3155                                log.debug(
3156                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3157                                )
3158
3159                                # Full path
3160                                annotation_file_found = full_path(annotation_file_found)
3161
3162                                if annotation_file_found:
3163
3164                                    database = Database(database=annotation_file_found)
3165                                    quick_annotation_format = database.get_format()
3166                                    quick_annotation_is_compressed = (
3167                                        database.is_compressed()
3168                                    )
3169                                    quick_annotation_is_indexed = os.path.exists(
3170                                        f"{annotation_file_found}.tbi"
3171                                    )
3172                                    bcftools_preference = False
3173
3174                                    # Check Annotation Tool
3175                                    if not annotation_tool:
3176                                        if (
3177                                            bcftools_preference
3178                                            and quick_annotation_format
3179                                            in ["vcf", "bed"]
3180                                            and quick_annotation_is_compressed
3181                                            and quick_annotation_is_indexed
3182                                        ):
3183                                            annotation_tool = "bcftools"
3184                                        elif quick_annotation_format in [
3185                                            "vcf",
3186                                            "bed",
3187                                            "tsv",
3188                                            "tsv",
3189                                            "csv",
3190                                            "json",
3191                                            "tbl",
3192                                            "parquet",
3193                                            "duckdb",
3194                                        ]:
3195                                            annotation_tool = "parquet"
3196                                        elif quick_annotation_format in ["bw"]:
3197                                            annotation_tool = "bigwig"
3198                                        else:
3199                                            log.error(
3200                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3201                                            )
3202                                            raise ValueError(
3203                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3204                                            )
3205
3206                                    log.debug(
3207                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3208                                    )
3209
3210                                    # Annotation Tool dispatch
3211                                    if annotation_tool:
3212                                        if annotation_tool not in param["annotation"]:
3213                                            param["annotation"][annotation_tool] = {}
3214                                        if (
3215                                            "annotations"
3216                                            not in param["annotation"][annotation_tool]
3217                                        ):
3218                                            param["annotation"][annotation_tool][
3219                                                "annotations"
3220                                            ] = {}
3221                                        param["annotation"][annotation_tool][
3222                                            "annotations"
3223                                        ][annotation_file_found] = annotations
3224
3225                                else:
3226                                    log.warning(
3227                                        f"Quick Annotation File {annotation_file} does NOT exist"
3228                                    )
3229
3230                self.set_param(param)
3231
3232        if param.get("annotation", None):
3233            log.info("Annotations")
3234            if param.get("annotation", {}).get("parquet", None):
3235                log.info("Annotations 'parquet'...")
3236                self.annotation_parquet()
3237            if param.get("annotation", {}).get("bcftools", None):
3238                log.info("Annotations 'bcftools'...")
3239                self.annotation_bcftools()
3240            if param.get("annotation", {}).get("snpsift", None):
3241                log.info("Annotations 'snpsift'...")
3242                self.annotation_snpsift()
3243            if param.get("annotation", {}).get("bigwig", None):
3244                log.info("Annotations 'bigwig'...")
3245                self.annotation_bigwig()
3246            if param.get("annotation", {}).get("annovar", None):
3247                log.info("Annotations 'annovar'...")
3248                self.annotation_annovar()
3249            if param.get("annotation", {}).get("snpeff", None):
3250                log.info("Annotations 'snpeff'...")
3251                self.annotation_snpeff()
3252            if param.get("annotation", {}).get("exomiser", None) is not None:
3253                log.info("Annotations 'exomiser'...")
3254                self.annotation_exomiser()
3255            if param.get("annotation", {}).get("splice", None) is not None:
3256                log.info("Annotations 'splice' ...")
3257                self.annotation_splice()
3258
3259        # Explode INFOS fields into table fields
3260        if self.get_explode_infos():
3261            self.explode_infos(
3262                prefix=self.get_explode_infos_prefix(),
3263                fields=self.get_explode_infos_fields(),
3264                force=True,
3265            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_bigwig(self, threads: int = None) -> None:
3267    def annotation_bigwig(self, threads: int = None) -> None:
3268        """
3269        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
3270
3271        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
3272        number of threads to be used for parallel processing during the annotation process. If the
3273        `threads` parameter is not provided, the method will attempt to determine the optimal number of
3274        threads to use based on the system configuration
3275        :type threads: int
3276        :return: True
3277        """
3278
3279        # DEBUG
3280        log.debug("Start annotation with bigwig databases")
3281
3282        # # Threads
3283        # if not threads:
3284        #     threads = self.get_threads()
3285        # log.debug("Threads: " + str(threads))
3286
3287        # Config
3288        config = self.get_config()
3289        log.debug("Config: " + str(config))
3290
3291        # Config - BCFTools databases folders
3292        databases_folders = set(
3293            self.get_config()
3294            .get("folders", {})
3295            .get("databases", {})
3296            .get("annotations", ["."])
3297            + self.get_config()
3298            .get("folders", {})
3299            .get("databases", {})
3300            .get("bigwig", ["."])
3301        )
3302        log.debug("Databases annotations: " + str(databases_folders))
3303
3304        # Param
3305        annotations = (
3306            self.get_param()
3307            .get("annotation", {})
3308            .get("bigwig", {})
3309            .get("annotations", None)
3310        )
3311        log.debug("Annotations: " + str(annotations))
3312
3313        # Assembly
3314        assembly = self.get_param().get(
3315            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3316        )
3317
3318        # Data
3319        table_variants = self.get_table_variants()
3320
3321        # Check if not empty
3322        log.debug("Check if not empty")
3323        sql_query_chromosomes = (
3324            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3325        )
3326        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3327        if not sql_query_chromosomes_df["count"][0]:
3328            log.info(f"VCF empty")
3329            return
3330
3331        # VCF header
3332        vcf_reader = self.get_header()
3333        log.debug("Initial header: " + str(vcf_reader.infos))
3334
3335        # Existing annotations
3336        for vcf_annotation in self.get_header().infos:
3337
3338            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3339            log.debug(
3340                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3341            )
3342
3343        if annotations:
3344
3345            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3346
3347                # Export VCF file
3348                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3349
3350                # annotation_bigwig_config
3351                annotation_bigwig_config_list = []
3352
3353                for annotation in annotations:
3354                    annotation_fields = annotations[annotation]
3355
3356                    # Annotation Name
3357                    annotation_name = os.path.basename(annotation)
3358
3359                    if not annotation_fields:
3360                        annotation_fields = {"INFO": None}
3361
3362                    log.debug(f"Annotation '{annotation_name}'")
3363                    log.debug(
3364                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3365                    )
3366
3367                    # Create Database
3368                    database = Database(
3369                        database=annotation,
3370                        databases_folders=databases_folders,
3371                        assembly=assembly,
3372                    )
3373
3374                    # Find files
3375                    db_file = database.get_database()
3376                    db_file = full_path(db_file)
3377                    db_hdr_file = database.get_header_file()
3378                    db_hdr_file = full_path(db_hdr_file)
3379                    db_file_type = database.get_format()
3380
3381                    # If db_file is http ?
3382                    if database.get_database().startswith("http"):
3383
3384                        # Datbase is HTTP URL
3385                        db_file_is_http = True
3386
3387                        # DB file keep as URL
3388                        db_file = database.get_database()
3389                        log.warning(
3390                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
3391                        )
3392
3393                        # Retrieve automatic annotation field name
3394                        annotation_field = clean_annotation_field(
3395                            os.path.basename(db_file).replace(".bw", "")
3396                        )
3397                        log.debug(
3398                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
3399                        )
3400
3401                        # Create automatic header file
3402                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
3403                        with open(db_hdr_file, "w") as f:
3404                            f.write("##fileformat=VCFv4.2\n")
3405                            f.write(
3406                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
3407                            )
3408                            f.write(f"#CHROM	START	END	{annotation_field}\n")
3409
3410                    else:
3411
3412                        # Datbase is NOT HTTP URL
3413                        db_file_is_http = False
3414
3415                    # Check index - try to create if not exists
3416                    if (
3417                        db_file is None
3418                        or db_hdr_file is None
3419                        or (not os.path.exists(db_file) and not db_file_is_http)
3420                        or not os.path.exists(db_hdr_file)
3421                        or not db_file_type in ["bw"]
3422                    ):
3423                        # if False:
3424                        log.error("Annotation failed: database not valid")
3425                        log.error(f"Annotation annotation file: {db_file}")
3426                        log.error(f"Annotation annotation file type: {db_file_type}")
3427                        log.error(f"Annotation annotation header: {db_hdr_file}")
3428                        raise ValueError(
3429                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
3430                        )
3431                    else:
3432
3433                        # Log
3434                        log.debug(
3435                            f"Annotation '{annotation}' - file: "
3436                            + str(db_file)
3437                            + " and "
3438                            + str(db_hdr_file)
3439                        )
3440
3441                        # Load header as VCF object
3442                        db_hdr_vcf = Variants(input=db_hdr_file)
3443                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3444                        log.debug(
3445                            "Annotation database header: "
3446                            + str(db_hdr_vcf_header_infos)
3447                        )
3448
3449                        # For all fields in database
3450                        annotation_fields_full = False
3451                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3452                            annotation_fields = {
3453                                key: key for key in db_hdr_vcf_header_infos
3454                            }
3455                            log.debug(
3456                                "Annotation database header - All annotations added: "
3457                                + str(annotation_fields)
3458                            )
3459                            annotation_fields_full = True
3460
3461                        # Init
3462                        cyvcf2_header_rename_dict = {}
3463                        cyvcf2_header_list = []
3464                        cyvcf2_header_indexes = {}
3465
3466                        # process annotation fields
3467                        for annotation_field in annotation_fields:
3468
3469                            # New annotation name
3470                            annotation_field_new = annotation_fields[annotation_field]
3471
3472                            # Check annotation field and index in header
3473                            if (
3474                                annotation_field
3475                                in db_hdr_vcf.get_header_columns_as_list()
3476                            ):
3477                                annotation_field_index = (
3478                                    db_hdr_vcf.get_header_columns_as_list().index(
3479                                        annotation_field
3480                                    )
3481                                    - 3
3482                                )
3483                                cyvcf2_header_indexes[annotation_field_new] = (
3484                                    annotation_field_index
3485                                )
3486                            else:
3487                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
3488                                log.error(msg_err)
3489                                raise ValueError(msg_err)
3490
3491                            # Append annotation field in cyvcf2 header list
3492                            cyvcf2_header_rename_dict[annotation_field_new] = (
3493                                db_hdr_vcf_header_infos[annotation_field].id
3494                            )
3495                            cyvcf2_header_list.append(
3496                                {
3497                                    "ID": annotation_field_new,
3498                                    "Number": db_hdr_vcf_header_infos[
3499                                        annotation_field
3500                                    ].num,
3501                                    "Type": db_hdr_vcf_header_infos[
3502                                        annotation_field
3503                                    ].type,
3504                                    "Description": db_hdr_vcf_header_infos[
3505                                        annotation_field
3506                                    ].desc,
3507                                }
3508                            )
3509
3510                            # Add header on VCF
3511                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
3512                                annotation_field_new,
3513                                db_hdr_vcf_header_infos[annotation_field].num,
3514                                db_hdr_vcf_header_infos[annotation_field].type,
3515                                db_hdr_vcf_header_infos[annotation_field].desc,
3516                                "HOWARD BigWig annotation",
3517                                "unknown",
3518                                self.code_type_map[
3519                                    db_hdr_vcf_header_infos[annotation_field].type
3520                                ],
3521                            )
3522
3523                        # Load bigwig database
3524                        bw_db = pyBigWig.open(db_file)
3525                        if bw_db.isBigWig():
3526                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
3527                        else:
3528                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
3529                            log.error(msg_err)
3530                            raise ValueError(msg_err)
3531
3532                        annotation_bigwig_config_list.append(
3533                            {
3534                                "db_file": db_file,
3535                                "bw_db": bw_db,
3536                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
3537                                "cyvcf2_header_list": cyvcf2_header_list,
3538                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
3539                            }
3540                        )
3541
3542                # Annotate
3543                if annotation_bigwig_config_list:
3544
3545                    # Annotation config
3546                    log.debug(
3547                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
3548                    )
3549
3550                    # Export VCF file
3551                    self.export_variant_vcf(
3552                        vcf_file=tmp_vcf_name,
3553                        remove_info=True,
3554                        add_samples=False,
3555                        index=True,
3556                    )
3557
3558                    # Load input tmp file
3559                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
3560
3561                    # Add header in input file
3562                    for annotation_bigwig_config in annotation_bigwig_config_list:
3563                        for cyvcf2_header_field in annotation_bigwig_config.get(
3564                            "cyvcf2_header_list", []
3565                        ):
3566                            log.info(
3567                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
3568                            )
3569                            input_vcf.add_info_to_header(cyvcf2_header_field)
3570
3571                    # Create output VCF file
3572                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
3573                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
3574
3575                    # Fetch variants
3576                    log.info(f"Annotations 'bigwig' start...")
3577                    for variant in input_vcf:
3578
3579                        for annotation_bigwig_config in annotation_bigwig_config_list:
3580
3581                            # DB and indexes
3582                            bw_db = annotation_bigwig_config.get("bw_db", None)
3583                            cyvcf2_header_indexes = annotation_bigwig_config.get(
3584                                "cyvcf2_header_indexes", None
3585                            )
3586
3587                            # Retrieve value from chrom pos
3588                            res = bw_db.values(
3589                                variant.CHROM, variant.POS - 1, variant.POS
3590                            )
3591
3592                            # For each annotation fields (and indexes)
3593                            for cyvcf2_header_index in cyvcf2_header_indexes:
3594
3595                                # If value is NOT nNone
3596                                if not np.isnan(
3597                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
3598                                ):
3599                                    variant.INFO[cyvcf2_header_index] = res[
3600                                        cyvcf2_header_indexes[cyvcf2_header_index]
3601                                    ]
3602
3603                        # Add record in output file
3604                        output_vcf.write_record(variant)
3605
3606                    # Log
3607                    log.debug(f"Annotation done.")
3608
3609                    # Close and write file
3610                    log.info(f"Annotations 'bigwig' write...")
3611                    output_vcf.close()
3612                    log.debug(f"Write done.")
3613
3614                    # Update variants
3615                    log.info(f"Annotations 'bigwig' update...")
3616                    self.update_from_vcf(output_vcf_file)
3617                    log.debug(f"Update done.")
3618
3619        return True

The function annotation_bigwig annotates variants in a VCF file using bigwig databases.

Parameters
  • threads: The threads parameter in the annotation_bigwig method specifies the number of threads to use for parallel processing during annotation. If it is not provided, the method determines the number of threads from the system configuration.
Returns

True

def annotation_snpsift(self, threads: int = None) -> None:
3621    def annotation_snpsift(self, threads: int = None) -> None:
3622        """
3623        This function annotate with bcftools
3624
3625        :param threads: Number of threads to use
3626        :return: the value of the variable "return_value".
3627        """
3628
3629        # DEBUG
3630        log.debug("Start annotation with bcftools databases")
3631
3632        # Threads
3633        if not threads:
3634            threads = self.get_threads()
3635        log.debug("Threads: " + str(threads))
3636
3637        # Config
3638        config = self.get_config()
3639        log.debug("Config: " + str(config))
3640
3641        # Config - snpSift
3642        snpsift_bin_command = get_bin_command(
3643            bin="SnpSift.jar",
3644            tool="snpsift",
3645            bin_type="jar",
3646            config=config,
3647            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3648        )
3649        if not snpsift_bin_command:
3650            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3651            log.error(msg_err)
3652            raise ValueError(msg_err)
3653
3654        # Config - bcftools
3655        bcftools_bin_command = get_bin_command(
3656            bin="bcftools",
3657            tool="bcftools",
3658            bin_type="bin",
3659            config=config,
3660            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3661        )
3662        if not bcftools_bin_command:
3663            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3664            log.error(msg_err)
3665            raise ValueError(msg_err)
3666
3667        # Config - BCFTools databases folders
3668        databases_folders = set(
3669            self.get_config()
3670            .get("folders", {})
3671            .get("databases", {})
3672            .get("annotations", ["."])
3673            + self.get_config()
3674            .get("folders", {})
3675            .get("databases", {})
3676            .get("bcftools", ["."])
3677        )
3678        log.debug("Databases annotations: " + str(databases_folders))
3679
3680        # Param
3681        annotations = (
3682            self.get_param()
3683            .get("annotation", {})
3684            .get("snpsift", {})
3685            .get("annotations", None)
3686        )
3687        log.debug("Annotations: " + str(annotations))
3688
3689        # Assembly
3690        assembly = self.get_param().get(
3691            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3692        )
3693
3694        # Data
3695        table_variants = self.get_table_variants()
3696
3697        # Check if not empty
3698        log.debug("Check if not empty")
3699        sql_query_chromosomes = (
3700            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3701        )
3702        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3703        if not sql_query_chromosomes_df["count"][0]:
3704            log.info(f"VCF empty")
3705            return
3706
3707        # VCF header
3708        vcf_reader = self.get_header()
3709        log.debug("Initial header: " + str(vcf_reader.infos))
3710
3711        # Existing annotations
3712        for vcf_annotation in self.get_header().infos:
3713
3714            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3715            log.debug(
3716                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3717            )
3718
3719        if annotations:
3720
3721            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3722
3723                # Export VCF file
3724                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3725
3726                # Init
3727                commands = {}
3728
3729                for annotation in annotations:
3730                    annotation_fields = annotations[annotation]
3731
3732                    # Annotation Name
3733                    annotation_name = os.path.basename(annotation)
3734
3735                    if not annotation_fields:
3736                        annotation_fields = {"INFO": None}
3737
3738                    log.debug(f"Annotation '{annotation_name}'")
3739                    log.debug(
3740                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3741                    )
3742
3743                    # Create Database
3744                    database = Database(
3745                        database=annotation,
3746                        databases_folders=databases_folders,
3747                        assembly=assembly,
3748                    )
3749
3750                    # Find files
3751                    db_file = database.get_database()
3752                    db_file = full_path(db_file)
3753                    db_hdr_file = database.get_header_file()
3754                    db_hdr_file = full_path(db_hdr_file)
3755                    db_file_type = database.get_format()
3756                    db_tbi_file = f"{db_file}.tbi"
3757                    db_file_compressed = database.is_compressed()
3758
3759                    # Check if compressed
3760                    if not db_file_compressed:
3761                        log.error(
3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3763                        )
3764                        raise ValueError(
3765                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3766                        )
3767
3768                    # Check if indexed
3769                    if not os.path.exists(db_tbi_file):
3770                        log.error(
3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3772                        )
3773                        raise ValueError(
3774                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3775                        )
3776
3777                    # Check index - try to create if not exists
3778                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3779                        log.error("Annotation failed: database not valid")
3780                        log.error(f"Annotation annotation file: {db_file}")
3781                        log.error(f"Annotation annotation header: {db_hdr_file}")
3782                        log.error(f"Annotation annotation index: {db_tbi_file}")
3783                        raise ValueError(
3784                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3785                        )
3786                    else:
3787
3788                        log.debug(
3789                            f"Annotation '{annotation}' - file: "
3790                            + str(db_file)
3791                            + " and "
3792                            + str(db_hdr_file)
3793                        )
3794
3795                        # Load header as VCF object
3796                        db_hdr_vcf = Variants(input=db_hdr_file)
3797                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3798                        log.debug(
3799                            "Annotation database header: "
3800                            + str(db_hdr_vcf_header_infos)
3801                        )
3802
3803                        # For all fields in database
3804                        annotation_fields_full = False
3805                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3806                            annotation_fields = {
3807                                key: key for key in db_hdr_vcf_header_infos
3808                            }
3809                            log.debug(
3810                                "Annotation database header - All annotations added: "
3811                                + str(annotation_fields)
3812                            )
3813                            annotation_fields_full = True
3814
3815                        # # Create file for field rename
3816                        # log.debug("Create file for field rename")
3817                        # tmp_rename = NamedTemporaryFile(
3818                        #     prefix=self.get_prefix(),
3819                        #     dir=self.get_tmp_dir(),
3820                        #     suffix=".rename",
3821                        #     delete=False,
3822                        # )
3823                        # tmp_rename_name = tmp_rename.name
3824                        # tmp_files.append(tmp_rename_name)
3825
3826                        # Number of fields
3827                        nb_annotation_field = 0
3828                        annotation_list = []
3829                        annotation_infos_rename_list = []
3830
3831                        for annotation_field in annotation_fields:
3832
3833                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3834                            annotation_fields_new_name = annotation_fields.get(
3835                                annotation_field, annotation_field
3836                            )
3837                            if not annotation_fields_new_name:
3838                                annotation_fields_new_name = annotation_field
3839
3840                            # Check if field is in DB and if field is not elready in input data
3841                            if (
3842                                annotation_field in db_hdr_vcf.get_header().infos
3843                                and annotation_fields_new_name
3844                                not in self.get_header().infos
3845                            ):
3846
3847                                log.info(
3848                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3849                                )
3850
3851                                # BCFTools annotate param to rename fields
3852                                if annotation_field != annotation_fields_new_name:
3853                                    annotation_infos_rename_list.append(
3854                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3855                                    )
3856
3857                                # Add INFO field to header
3858                                db_hdr_vcf_header_infos_number = (
3859                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3860                                )
3861                                db_hdr_vcf_header_infos_type = (
3862                                    db_hdr_vcf_header_infos[annotation_field].type
3863                                    or "String"
3864                                )
3865                                db_hdr_vcf_header_infos_description = (
3866                                    db_hdr_vcf_header_infos[annotation_field].desc
3867                                    or f"{annotation_field} description"
3868                                )
3869                                db_hdr_vcf_header_infos_source = (
3870                                    db_hdr_vcf_header_infos[annotation_field].source
3871                                    or "unknown"
3872                                )
3873                                db_hdr_vcf_header_infos_version = (
3874                                    db_hdr_vcf_header_infos[annotation_field].version
3875                                    or "unknown"
3876                                )
3877
3878                                vcf_reader.infos[annotation_fields_new_name] = (
3879                                    vcf.parser._Info(
3880                                        annotation_fields_new_name,
3881                                        db_hdr_vcf_header_infos_number,
3882                                        db_hdr_vcf_header_infos_type,
3883                                        db_hdr_vcf_header_infos_description,
3884                                        db_hdr_vcf_header_infos_source,
3885                                        db_hdr_vcf_header_infos_version,
3886                                        self.code_type_map[
3887                                            db_hdr_vcf_header_infos_type
3888                                        ],
3889                                    )
3890                                )
3891
3892                                annotation_list.append(annotation_field)
3893
3894                                nb_annotation_field += 1
3895
3896                            else:
3897
3898                                if (
3899                                    annotation_field
3900                                    not in db_hdr_vcf.get_header().infos
3901                                ):
3902                                    log.warning(
3903                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3904                                    )
3905                                if (
3906                                    annotation_fields_new_name
3907                                    in self.get_header().infos
3908                                ):
3909                                    log.warning(
3910                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3911                                    )
3912
3913                        log.info(
3914                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3915                        )
3916
3917                        annotation_infos = ",".join(annotation_list)
3918
3919                        if annotation_infos != "":
3920
3921                            # Annotated VCF (and error file)
3922                            tmp_annotation_vcf_name = os.path.join(
3923                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3924                            )
3925                            tmp_annotation_vcf_name_err = (
3926                                tmp_annotation_vcf_name + ".err"
3927                            )
3928
3929                            # Add fields to annotate
3930                            if not annotation_fields_full:
3931                                annotation_infos_option = f"-info {annotation_infos}"
3932                            else:
3933                                annotation_infos_option = ""
3934
3935                            # Info fields rename
3936                            if annotation_infos_rename_list:
3937                                annotation_infos_rename = " -c " + ",".join(
3938                                    annotation_infos_rename_list
3939                                )
3940                            else:
3941                                annotation_infos_rename = ""
3942
3943                            # Annotate command
3944                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3945
3946                            # Add command
3947                            commands[command_annotate] = tmp_annotation_vcf_name
3948
3949                if commands:
3950
3951                    # Export VCF file
3952                    self.export_variant_vcf(
3953                        vcf_file=tmp_vcf_name,
3954                        remove_info=True,
3955                        add_samples=False,
3956                        index=True,
3957                    )
3958                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3959
3960                    # Num command
3961                    nb_command = 0
3962
3963                    # Annotate
3964                    for command_annotate in commands:
3965                        nb_command += 1
3966                        log.info(
3967                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3968                        )
3969                        log.debug(f"command_annotate={command_annotate}")
3970                        run_parallel_commands([command_annotate], threads)
3971
3972                        # Debug
3973                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3974
3975                        # Update variants
3976                        log.info(
3977                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3978                        )
3979                        self.update_from_vcf(commands[command_annotate])

This function annotates variants using SnpSift, piping the result through bcftools to rename INFO fields and compress the output.

Parameters
  • threads: Number of threads to use
Returns

None; annotations are merged into the variants table in place.

def annotation_bcftools(self, threads: int = None) -> None:
3981    def annotation_bcftools(self, threads: int = None) -> None:
3982        """
3983        This function annotate with bcftools
3984
3985        :param threads: Number of threads to use
3986        :return: the value of the variable "return_value".
3987        """
3988
3989        # DEBUG
3990        log.debug("Start annotation with bcftools databases")
3991
3992        # Threads
3993        if not threads:
3994            threads = self.get_threads()
3995        log.debug("Threads: " + str(threads))
3996
3997        # Config
3998        config = self.get_config()
3999        log.debug("Config: " + str(config))
4000
4001        # DEBUG
4002        delete_tmp = True
4003        if self.get_config().get("verbosity", "warning") in ["debug"]:
4004            delete_tmp = False
4005            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4006
4007        # Config - BCFTools bin command
4008        bcftools_bin_command = get_bin_command(
4009            bin="bcftools",
4010            tool="bcftools",
4011            bin_type="bin",
4012            config=config,
4013            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
4014        )
4015        if not bcftools_bin_command:
4016            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
4017            log.error(msg_err)
4018            raise ValueError(msg_err)
4019
4020        # Config - BCFTools databases folders
4021        databases_folders = set(
4022            self.get_config()
4023            .get("folders", {})
4024            .get("databases", {})
4025            .get("annotations", ["."])
4026            + self.get_config()
4027            .get("folders", {})
4028            .get("databases", {})
4029            .get("bcftools", ["."])
4030        )
4031        log.debug("Databases annotations: " + str(databases_folders))
4032
4033        # Param
4034        annotations = (
4035            self.get_param()
4036            .get("annotation", {})
4037            .get("bcftools", {})
4038            .get("annotations", None)
4039        )
4040        log.debug("Annotations: " + str(annotations))
4041
4042        # Assembly
4043        assembly = self.get_param().get(
4044            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
4045        )
4046
4047        # Data
4048        table_variants = self.get_table_variants()
4049
4050        # Check if not empty
4051        log.debug("Check if not empty")
4052        sql_query_chromosomes = (
4053            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4054        )
4055        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
4056        if not sql_query_chromosomes_df["count"][0]:
4057            log.info(f"VCF empty")
4058            return
4059
4060        # Export in VCF
4061        log.debug("Create initial file to annotate")
4062        tmp_vcf = NamedTemporaryFile(
4063            prefix=self.get_prefix(),
4064            dir=self.get_tmp_dir(),
4065            suffix=".vcf.gz",
4066            delete=False,
4067        )
4068        tmp_vcf_name = tmp_vcf.name
4069
4070        # VCF header
4071        vcf_reader = self.get_header()
4072        log.debug("Initial header: " + str(vcf_reader.infos))
4073
4074        # Existing annotations
4075        for vcf_annotation in self.get_header().infos:
4076
4077            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4078            log.debug(
4079                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4080            )
4081
4082        if annotations:
4083
4084            tmp_ann_vcf_list = []
4085            commands = []
4086            tmp_files = []
4087            err_files = []
4088
4089            for annotation in annotations:
4090                annotation_fields = annotations[annotation]
4091
4092                # Annotation Name
4093                annotation_name = os.path.basename(annotation)
4094
4095                if not annotation_fields:
4096                    annotation_fields = {"INFO": None}
4097
4098                log.debug(f"Annotation '{annotation_name}'")
4099                log.debug(
4100                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
4101                )
4102
4103                # Create Database
4104                database = Database(
4105                    database=annotation,
4106                    databases_folders=databases_folders,
4107                    assembly=assembly,
4108                )
4109
4110                # Find files
4111                db_file = database.get_database()
4112                db_file = full_path(db_file)
4113                db_hdr_file = database.get_header_file()
4114                db_hdr_file = full_path(db_hdr_file)
4115                db_file_type = database.get_format()
4116                db_tbi_file = f"{db_file}.tbi"
4117                db_file_compressed = database.is_compressed()
4118
4119                # Check if compressed
4120                if not db_file_compressed:
4121                    log.error(
4122                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4123                    )
4124                    raise ValueError(
4125                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4126                    )
4127
4128                # Check if indexed
4129                if not os.path.exists(db_tbi_file):
4130                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
4131                    raise ValueError(
4132                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
4133                    )
4134
4135                # Check index - try to create if not exists
4136                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
4137                    log.error("Annotation failed: database not valid")
4138                    log.error(f"Annotation annotation file: {db_file}")
4139                    log.error(f"Annotation annotation header: {db_hdr_file}")
4140                    log.error(f"Annotation annotation index: {db_tbi_file}")
4141                    raise ValueError(
4142                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
4143                    )
4144                else:
4145
4146                    log.debug(
4147                        f"Annotation '{annotation}' - file: "
4148                        + str(db_file)
4149                        + " and "
4150                        + str(db_hdr_file)
4151                    )
4152
4153                    # Load header as VCF object
4154                    db_hdr_vcf = Variants(input=db_hdr_file)
4155                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
4156                    log.debug(
4157                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
4158                    )
4159
4160                    # For all fields in database
4161                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
4162                        annotation_fields = {
4163                            key: key for key in db_hdr_vcf_header_infos
4164                        }
4165                        log.debug(
4166                            "Annotation database header - All annotations added: "
4167                            + str(annotation_fields)
4168                        )
4169
4170                    # Number of fields
4171                    nb_annotation_field = 0
4172                    annotation_list = []
4173
4174                    for annotation_field in annotation_fields:
4175
4176                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
4177                        annotation_fields_new_name = annotation_fields.get(
4178                            annotation_field, annotation_field
4179                        )
4180                        if not annotation_fields_new_name:
4181                            annotation_fields_new_name = annotation_field
4182
4183                        # Check if field is in DB and if field is not elready in input data
4184                        if (
4185                            annotation_field in db_hdr_vcf.get_header().infos
4186                            and annotation_fields_new_name
4187                            not in self.get_header().infos
4188                        ):
4189
4190                            log.info(
4191                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
4192                            )
4193
4194                            # Add INFO field to header
4195                            db_hdr_vcf_header_infos_number = (
4196                                db_hdr_vcf_header_infos[annotation_field].num or "."
4197                            )
4198                            db_hdr_vcf_header_infos_type = (
4199                                db_hdr_vcf_header_infos[annotation_field].type
4200                                or "String"
4201                            )
4202                            db_hdr_vcf_header_infos_description = (
4203                                db_hdr_vcf_header_infos[annotation_field].desc
4204                                or f"{annotation_field} description"
4205                            )
4206                            db_hdr_vcf_header_infos_source = (
4207                                db_hdr_vcf_header_infos[annotation_field].source
4208                                or "unknown"
4209                            )
4210                            db_hdr_vcf_header_infos_version = (
4211                                db_hdr_vcf_header_infos[annotation_field].version
4212                                or "unknown"
4213                            )
4214
4215                            vcf_reader.infos[annotation_fields_new_name] = (
4216                                vcf.parser._Info(
4217                                    annotation_fields_new_name,
4218                                    db_hdr_vcf_header_infos_number,
4219                                    db_hdr_vcf_header_infos_type,
4220                                    db_hdr_vcf_header_infos_description,
4221                                    db_hdr_vcf_header_infos_source,
4222                                    db_hdr_vcf_header_infos_version,
4223                                    self.code_type_map[db_hdr_vcf_header_infos_type],
4224                                )
4225                            )
4226
4227                            # annotation_list.append(annotation_field)
4228                            if annotation_field != annotation_fields_new_name:
4229                                annotation_list.append(
4230                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
4231                                )
4232                            else:
4233                                annotation_list.append(annotation_field)
4234
4235                            nb_annotation_field += 1
4236
4237                        else:
4238
4239                            if annotation_field not in db_hdr_vcf.get_header().infos:
4240                                log.warning(
4241                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
4242                                )
4243                            if annotation_fields_new_name in self.get_header().infos:
4244                                log.warning(
4245                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
4246                                )
4247
4248                    log.info(
4249                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
4250                    )
4251
4252                    annotation_infos = ",".join(annotation_list)
4253
4254                    if annotation_infos != "":
4255
4256                        # Protect header for bcftools (remove "#CHROM" and variants line)
4257                        log.debug("Protect Header file - remove #CHROM line if exists")
4258                        tmp_header_vcf = NamedTemporaryFile(
4259                            prefix=self.get_prefix(),
4260                            dir=self.get_tmp_dir(),
4261                            suffix=".hdr",
4262                            delete=False,
4263                        )
4264                        tmp_header_vcf_name = tmp_header_vcf.name
4265                        tmp_files.append(tmp_header_vcf_name)
4266                        # Command
4267                        if db_hdr_file.endswith(".gz"):
4268                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4269                        else:
4270                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4271                        # Run
4272                        run_parallel_commands([command_extract_header], 1)
4273
4274                        # Find chomosomes
4275                        log.debug("Find chromosomes ")
4276                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
4277                        sql_query_chromosomes_df = self.get_query_to_df(
4278                            sql_query_chromosomes
4279                        )
4280                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
4281
4282                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
4283
4284                        # BED columns in the annotation file
4285                        if db_file_type in ["bed"]:
4286                            annotation_infos = "CHROM,POS,POS," + annotation_infos
4287
4288                        for chrom in chomosomes_list:
4289
4290                            # Create BED on initial VCF
4291                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
4292                            tmp_bed = NamedTemporaryFile(
4293                                prefix=self.get_prefix(),
4294                                dir=self.get_tmp_dir(),
4295                                suffix=".bed",
4296                                delete=False,
4297                            )
4298                            tmp_bed_name = tmp_bed.name
4299                            tmp_files.append(tmp_bed_name)
4300
4301                            # Detecte regions
4302                            log.debug(
4303                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
4304                            )
4305                            window = 1000000
4306                            sql_query_intervals_for_bed = f"""
4307                                SELECT  \"#CHROM\",
4308                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
4309                                        \"POS\"+{window}
4310                                FROM {table_variants} as table_variants
4311                                WHERE table_variants.\"#CHROM\" = '{chrom}'
4312                            """
4313                            regions = self.conn.execute(
4314                                sql_query_intervals_for_bed
4315                            ).fetchall()
4316                            merged_regions = merge_regions(regions)
4317                            log.debug(
4318                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
4319                            )
4320
4321                            header = ["#CHROM", "START", "END"]
4322                            with open(tmp_bed_name, "w") as f:
4323                                # Write the header with tab delimiter
4324                                f.write("\t".join(header) + "\n")
4325                                for d in merged_regions:
4326                                    # Write each data row with tab delimiter
4327                                    f.write("\t".join(map(str, d)) + "\n")
4328
4329                            # Tmp files
4330                            tmp_annotation_vcf = NamedTemporaryFile(
4331                                prefix=self.get_prefix(),
4332                                dir=self.get_tmp_dir(),
4333                                suffix=".vcf.gz",
4334                                delete=False,
4335                            )
4336                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
4337                            tmp_files.append(tmp_annotation_vcf_name)
4338                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
4339                            tmp_annotation_vcf_name_err = (
4340                                tmp_annotation_vcf_name + ".err"
4341                            )
4342                            err_files.append(tmp_annotation_vcf_name_err)
4343
4344                            # Annotate Command
4345                            log.debug(
4346                                f"Annotation '{annotation}' - add bcftools command"
4347                            )
4348
4349                            # Command
4350                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
4351
4352                            # Add command
4353                            commands.append(command_annotate)
4354
4355            # if some commands
4356            if commands:
4357
4358                # Export VCF file
4359                self.export_variant_vcf(
4360                    vcf_file=tmp_vcf_name,
4361                    remove_info=True,
4362                    add_samples=False,
4363                    index=True,
4364                )
4365
4366                # Threads
4367                # calculate threads for annotated commands
4368                if commands:
4369                    threads_bcftools_annotate = round(threads / len(commands))
4370                else:
4371                    threads_bcftools_annotate = 1
4372
4373                if not threads_bcftools_annotate:
4374                    threads_bcftools_annotate = 1
4375
4376                # Add threads option to bcftools commands
4377                if threads_bcftools_annotate > 1:
4378                    commands_threaded = []
4379                    for command in commands:
4380                        commands_threaded.append(
4381                            command.replace(
4382                                f"{bcftools_bin_command} annotate ",
4383                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
4384                            )
4385                        )
4386                    commands = commands_threaded
4387
4388                # Command annotation multithreading
4389                log.debug(f"Annotation - Annotation commands: " + str(commands))
4390                log.info(
4391                    f"Annotation - Annotation multithreaded in "
4392                    + str(len(commands))
4393                    + " commands"
4394                )
4395
4396                run_parallel_commands(commands, threads)
4397
4398                # Merge
4399                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
4400
4401                if tmp_ann_vcf_list_cmd:
4402
4403                    # Tmp file
4404                    tmp_annotate_vcf = NamedTemporaryFile(
4405                        prefix=self.get_prefix(),
4406                        dir=self.get_tmp_dir(),
4407                        suffix=".vcf.gz",
4408                        delete=True,
4409                    )
4410                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
4411                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4412                    err_files.append(tmp_annotate_vcf_name_err)
4413
4414                    # Tmp file remove command
4415                    tmp_files_remove_command = ""
4416                    if tmp_files:
4417                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
4418
4419                    # Command merge
4420                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
4421                    log.info(
4422                        f"Annotation - Annotation merging "
4423                        + str(len(commands))
4424                        + " annotated files"
4425                    )
4426                    log.debug(f"Annotation - merge command: {merge_command}")
4427                    run_parallel_commands([merge_command], 1)
4428
4429                    # Error messages
4430                    log.info(f"Error/Warning messages:")
4431                    error_message_command_all = []
4432                    error_message_command_warning = []
4433                    error_message_command_err = []
4434                    for err_file in err_files:
4435                        with open(err_file, "r") as f:
4436                            for line in f:
4437                                message = line.strip()
4438                                error_message_command_all.append(message)
4439                                if line.startswith("[W::"):
4440                                    error_message_command_warning.append(message)
4441                                if line.startswith("[E::"):
4442                                    error_message_command_err.append(
4443                                        f"{err_file}: " + message
4444                                    )
4445                    # log info
4446                    for message in list(
4447                        set(error_message_command_err + error_message_command_warning)
4448                    ):
4449                        log.info(f"   {message}")
4450                    # debug info
4451                    for message in list(set(error_message_command_all)):
4452                        log.debug(f"   {message}")
4453                    # failed
4454                    if len(error_message_command_err):
4455                        log.error("Annotation failed: Error in commands")
4456                        raise ValueError("Annotation failed: Error in commands")
4457
4458                    # Update variants
4459                    log.info(f"Annotation - Updating...")
4460                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with bcftools.

Parameters
  • threads: Number of threads to use

Returns
  The value of the variable ``return_value``.
4462    def annotation_exomiser(self, threads: int = None) -> None:
4463        """
4464        This function annotate with Exomiser
4465
4466        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4467        - "analysis" (dict/file):
4468            Full analysis dictionnary parameters (see Exomiser docs).
4469            Either a dict, or a file in JSON or YAML format.
4470            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4471            Default : None
4472        - "preset" (string):
4473            Analysis preset (available in config folder).
4474            Used if no full "analysis" is provided.
4475            Default: "exome"
4476        - "phenopacket" (dict/file):
4477            Samples and phenotipic features parameters (see Exomiser docs).
4478            Either a dict, or a file in JSON or YAML format.
4479            Default: None
4480        - "subject" (dict):
4481            Sample parameters (see Exomiser docs).
4482            Example:
4483                "subject":
4484                    {
4485                        "id": "ISDBM322017",
4486                        "sex": "FEMALE"
4487                    }
4488            Default: None
4489        - "sample" (string):
4490            Sample name to construct "subject" section:
4491                "subject":
4492                    {
4493                        "id": "<sample>",
4494                        "sex": "UNKNOWN_SEX"
4495                    }
4496            Default: None
4497        - "phenotypicFeatures" (dict)
4498            Phenotypic features to construct "subject" section.
4499            Example:
4500                "phenotypicFeatures":
4501                    [
4502                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4503                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4504                    ]
4505        - "hpo" (list)
4506            List of HPO ids as phenotypic features.
4507            Example:
4508                "hpo": ['0001156', '0001363', '0011304', '0010055']
4509            Default: []
4510        - "outputOptions" (dict):
4511            Output options (see Exomiser docs).
4512            Default:
4513                "output_options" =
4514                    {
4515                        "outputContributingVariantsOnly": False,
4516                        "numGenes": 0,
4517                        "outputFormats": ["TSV_VARIANT", "VCF"]
4518                    }
4519        - "transcript_source" (string):
4520            Transcript source (either "refseq", "ucsc", "ensembl")
4521            Default: "refseq"
4522        - "exomiser_to_info" (boolean):
4523            Add exomiser TSV file columns as INFO fields in VCF.
4524            Default: False
4525        - "release" (string):
4526            Exomise database release.
4527            If not exists, database release will be downloaded (take a while).
4528            Default: None (provided by application.properties configuration file)
4529        - "exomiser_application_properties" (file):
4530            Exomiser configuration file (see Exomiser docs).
4531            Useful to automatically download databases (especially for specific genome databases).
4532
4533        Notes:
4534        - If no sample in parameters, first sample in VCF will be chosen
4535        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4536
4537        :param threads: The number of threads to use
4538        :return: None.
4539        """
4540
4541        # DEBUG
4542        log.debug("Start annotation with Exomiser databases")
4543
4544        # Threads
4545        if not threads:
4546            threads = self.get_threads()
4547        log.debug("Threads: " + str(threads))
4548
4549        # Config
4550        config = self.get_config()
4551        log.debug("Config: " + str(config))
4552
4553        # Config - Folders - Databases
4554        databases_folders = (
4555            config.get("folders", {})
4556            .get("databases", {})
4557            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4558        )
4559        databases_folders = full_path(databases_folders)
4560        if not os.path.exists(databases_folders):
4561            log.error(f"Databases annotations: {databases_folders} NOT found")
4562        log.debug("Databases annotations: " + str(databases_folders))
4563
4564        # Config - Exomiser
4565        exomiser_bin_command = get_bin_command(
4566            bin="exomiser-cli*.jar",
4567            tool="exomiser",
4568            bin_type="jar",
4569            config=config,
4570            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4571        )
4572        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4573        if not exomiser_bin_command:
4574            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4575            log.error(msg_err)
4576            raise ValueError(msg_err)
4577
4578        # Param
4579        param = self.get_param()
4580        log.debug("Param: " + str(param))
4581
4582        # Param - Exomiser
4583        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4584        log.debug(f"Param Exomiser: {param_exomiser}")
4585
4586        # Param - Assembly
4587        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4588        log.debug("Assembly: " + str(assembly))
4589
4590        # Data
4591        table_variants = self.get_table_variants()
4592
4593        # Check if not empty
4594        log.debug("Check if not empty")
4595        sql_query_chromosomes = (
4596            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4597        )
4598        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4599            log.info(f"VCF empty")
4600            return False
4601
4602        # VCF header
4603        vcf_reader = self.get_header()
4604        log.debug("Initial header: " + str(vcf_reader.infos))
4605
4606        # Samples
4607        samples = self.get_header_sample_list()
4608        if not samples:
4609            log.error("No Samples in VCF")
4610            return False
4611        log.debug(f"Samples: {samples}")
4612
4613        # Memory limit
4614        memory_limit = self.get_memory("8G")
4615        log.debug(f"memory_limit: {memory_limit}")
4616
4617        # Exomiser java options
4618        exomiser_java_options = (
4619            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4620        )
4621        log.debug(f"Exomiser java options: {exomiser_java_options}")
4622
4623        # Download Exomiser (if not exists)
4624        exomiser_release = param_exomiser.get("release", None)
4625        exomiser_application_properties = param_exomiser.get(
4626            "exomiser_application_properties", None
4627        )
4628        databases_download_exomiser(
4629            assemblies=[assembly],
4630            exomiser_folder=databases_folders,
4631            exomiser_release=exomiser_release,
4632            exomiser_phenotype_release=exomiser_release,
4633            exomiser_application_properties=exomiser_application_properties,
4634        )
4635
4636        # Force annotation
4637        force_update_annotation = True
4638
4639        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4640            log.debug("Start annotation Exomiser")
4641
4642            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4643
4644                # tmp_dir = "/tmp/exomiser"
4645
4646                ### ANALYSIS ###
4647                ################
4648
4649                # Create analysis.json through analysis dict
4650                # either analysis in param or by default
4651                # depending on preset exome/genome)
4652
4653                # Init analysis dict
4654                param_exomiser_analysis_dict = {}
4655
4656                # analysis from param
4657                param_exomiser_analysis = param_exomiser.get("analysis", {})
4658                param_exomiser_analysis = full_path(param_exomiser_analysis)
4659
4660                # If analysis in param -> load anlaysis json
4661                if param_exomiser_analysis:
4662
4663                    # If param analysis is a file and exists
4664                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4665                        param_exomiser_analysis
4666                    ):
4667                        # Load analysis file into analysis dict (either yaml or json)
4668                        with open(param_exomiser_analysis) as json_file:
4669                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4670
4671                    # If param analysis is a dict
4672                    elif isinstance(param_exomiser_analysis, dict):
4673                        # Load analysis dict into analysis dict (either yaml or json)
4674                        param_exomiser_analysis_dict = param_exomiser_analysis
4675
4676                    # Error analysis type
4677                    else:
4678                        log.error(f"Analysis type unknown. Check param file.")
4679                        raise ValueError(f"Analysis type unknown. Check param file.")
4680
4681                # Case no input analysis config file/dict
4682                # Use preset (exome/genome) to open default config file
4683                if not param_exomiser_analysis_dict:
4684
4685                    # default preset
4686                    default_preset = "exome"
4687
4688                    # Get param preset or default preset
4689                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4690
4691                    # Try to find if preset is a file
4692                    if os.path.exists(param_exomiser_preset):
4693                        # Preset file is provided in full path
4694                        param_exomiser_analysis_default_config_file = (
4695                            param_exomiser_preset
4696                        )
4697                    # elif os.path.exists(full_path(param_exomiser_preset)):
4698                    #     # Preset file is provided in full path
4699                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4700                    elif os.path.exists(
4701                        os.path.join(folder_config, param_exomiser_preset)
4702                    ):
4703                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4704                        param_exomiser_analysis_default_config_file = os.path.join(
4705                            folder_config, param_exomiser_preset
4706                        )
4707                    else:
4708                        # Construct preset file
4709                        param_exomiser_analysis_default_config_file = os.path.join(
4710                            folder_config,
4711                            f"preset-{param_exomiser_preset}-analysis.json",
4712                        )
4713
4714                    # If preset file exists
4715                    param_exomiser_analysis_default_config_file = full_path(
4716                        param_exomiser_analysis_default_config_file
4717                    )
4718                    if os.path.exists(param_exomiser_analysis_default_config_file):
4719                        # Load prest file into analysis dict (either yaml or json)
4720                        with open(
4721                            param_exomiser_analysis_default_config_file
4722                        ) as json_file:
4723                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4724                                json_file
4725                            )
4726
4727                    # Error preset file
4728                    else:
4729                        log.error(
4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4731                        )
4732                        raise ValueError(
4733                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4734                        )
4735
4736                # If no analysis dict created
4737                if not param_exomiser_analysis_dict:
4738                    log.error(f"No analysis config")
4739                    raise ValueError(f"No analysis config")
4740
4741                # Log
4742                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4743
4744                ### PHENOPACKET ###
4745                ###################
4746
4747                # If no PhenoPacket in analysis dict -> check in param
4748                if "phenopacket" not in param_exomiser_analysis_dict:
4749
4750                    # If PhenoPacket in param -> load anlaysis json
4751                    if param_exomiser.get("phenopacket", None):
4752
4753                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4754                        param_exomiser_phenopacket = full_path(
4755                            param_exomiser_phenopacket
4756                        )
4757
4758                        # If param phenopacket is a file and exists
4759                        if isinstance(
4760                            param_exomiser_phenopacket, str
4761                        ) and os.path.exists(param_exomiser_phenopacket):
4762                            # Load phenopacket file into analysis dict (either yaml or json)
4763                            with open(param_exomiser_phenopacket) as json_file:
4764                                param_exomiser_analysis_dict["phenopacket"] = (
4765                                    yaml.safe_load(json_file)
4766                                )
4767
4768                        # If param phenopacket is a dict
4769                        elif isinstance(param_exomiser_phenopacket, dict):
4770                            # Load phenopacket dict into analysis dict (either yaml or json)
4771                            param_exomiser_analysis_dict["phenopacket"] = (
4772                                param_exomiser_phenopacket
4773                            )
4774
4775                        # Error phenopacket type
4776                        else:
4777                            log.error(f"Phenopacket type unknown. Check param file.")
4778                            raise ValueError(
4779                                f"Phenopacket type unknown. Check param file."
4780                            )
4781
4782                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4783                if "phenopacket" not in param_exomiser_analysis_dict:
4784
4785                    # Init PhenoPacket
4786                    param_exomiser_analysis_dict["phenopacket"] = {
4787                        "id": "analysis",
4788                        "proband": {},
4789                    }
4790
4791                    ### Add subject ###
4792
4793                    # If subject exists
4794                    param_exomiser_subject = param_exomiser.get("subject", {})
4795
4796                    # If subject not exists -> found sample ID
4797                    if not param_exomiser_subject:
4798
4799                        # Found sample ID in param
4800                        sample = param_exomiser.get("sample", None)
4801
4802                        # Find sample ID (first sample)
4803                        if not sample:
4804                            sample_list = self.get_header_sample_list()
4805                            if len(sample_list) > 0:
4806                                sample = sample_list[0]
4807                            else:
4808                                log.error(f"No sample found")
4809                                raise ValueError(f"No sample found")
4810
4811                        # Create subject
4812                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4813
4814                    # Add to dict
4815                    param_exomiser_analysis_dict["phenopacket"][
4816                        "subject"
4817                    ] = param_exomiser_subject
4818
4819                    ### Add "phenotypicFeatures" ###
4820
4821                    # If phenotypicFeatures exists
4822                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4823                        "phenotypicFeatures", []
4824                    )
4825
4826                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4827                    if not param_exomiser_phenotypicfeatures:
4828
4829                        # Found HPO in param
4830                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4831
4832                        # Split HPO if list in string format separated by comma
4833                        if isinstance(param_exomiser_hpo, str):
4834                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4835
4836                        # Create HPO list
4837                        for hpo in param_exomiser_hpo:
4838                            hpo_clean = re.sub("[^0-9]", "", hpo)
4839                            param_exomiser_phenotypicfeatures.append(
4840                                {
4841                                    "type": {
4842                                        "id": f"HP:{hpo_clean}",
4843                                        "label": f"HP:{hpo_clean}",
4844                                    }
4845                                }
4846                            )
4847
4848                    # Add to dict
4849                    param_exomiser_analysis_dict["phenopacket"][
4850                        "phenotypicFeatures"
4851                    ] = param_exomiser_phenotypicfeatures
4852
4853                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4854                    if not param_exomiser_phenotypicfeatures:
4855                        for step in param_exomiser_analysis_dict.get(
4856                            "analysis", {}
4857                        ).get("steps", []):
4858                            if "hiPhivePrioritiser" in step:
4859                                param_exomiser_analysis_dict.get("analysis", {}).get(
4860                                    "steps", []
4861                                ).remove(step)
4862
4863                ### Add Input File ###
4864
4865                # Initial file name and htsFiles
4866                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4867                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4868                    {
4869                        "uri": tmp_vcf_name,
4870                        "htsFormat": "VCF",
4871                        "genomeAssembly": assembly,
4872                    }
4873                ]
4874
4875                ### Add metaData ###
4876
4877                # If metaData not in analysis dict
4878                if "metaData" not in param_exomiser_analysis_dict:
4879                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4880                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4881                        "createdBy": "howard",
4882                        "phenopacketSchemaVersion": 1,
4883                    }
4884
4885                ### OutputOptions ###
4886
4887                # Init output result folder
4888                output_results = os.path.join(tmp_dir, "results")
4889
4890                # If no outputOptions in analysis dict
4891                if "outputOptions" not in param_exomiser_analysis_dict:
4892
4893                    # default output formats
4894                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4895
4896                    # Get outputOptions in param
4897                    output_options = param_exomiser.get("outputOptions", None)
4898
4899                    # If no output_options in param -> check
4900                    if not output_options:
4901                        output_options = {
4902                            "outputContributingVariantsOnly": False,
4903                            "numGenes": 0,
4904                            "outputFormats": defaut_output_formats,
4905                        }
4906
4907                    # Replace outputDirectory in output options
4908                    output_options["outputDirectory"] = output_results
4909                    output_options["outputFileName"] = "howard"
4910
4911                    # Add outputOptions in analysis dict
4912                    param_exomiser_analysis_dict["outputOptions"] = output_options
4913
4914                else:
4915
4916                    # Replace output_results and output format (if exists in param)
4917                    param_exomiser_analysis_dict["outputOptions"][
4918                        "outputDirectory"
4919                    ] = output_results
4920                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4921                        list(
4922                            set(
4923                                param_exomiser_analysis_dict.get(
4924                                    "outputOptions", {}
4925                                ).get("outputFormats", [])
4926                                + ["TSV_VARIANT", "VCF"]
4927                            )
4928                        )
4929                    )
4930
4931                # log
4932                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4933
4934                ### ANALYSIS FILE ###
4935                #####################
4936
4937                ### Full JSON analysis config file ###
4938
4939                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4940                with open(exomiser_analysis, "w") as fp:
4941                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4942
4943                ### SPLIT analysis and sample config files
4944
4945                # Splitted analysis dict
4946                param_exomiser_analysis_dict_for_split = (
4947                    param_exomiser_analysis_dict.copy()
4948                )
4949
4950                # Phenopacket JSON file
4951                exomiser_analysis_phenopacket = os.path.join(
4952                    tmp_dir, "analysis_phenopacket.json"
4953                )
4954                with open(exomiser_analysis_phenopacket, "w") as fp:
4955                    json.dump(
4956                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4957                        fp,
4958                        indent=4,
4959                    )
4960
4961                # Analysis JSON file without Phenopacket parameters
4962                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4963                exomiser_analysis_analysis = os.path.join(
4964                    tmp_dir, "analysis_analysis.json"
4965                )
4966                with open(exomiser_analysis_analysis, "w") as fp:
4967                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4968
4969                ### INITAL VCF file ###
4970                #######################
4971
4972                ### Create list of samples to use and include inti initial VCF file ####
4973
4974                # Subject (main sample)
4975                # Get sample ID in analysis dict
4976                sample_subject = (
4977                    param_exomiser_analysis_dict.get("phenopacket", {})
4978                    .get("subject", {})
4979                    .get("id", None)
4980                )
4981                sample_proband = (
4982                    param_exomiser_analysis_dict.get("phenopacket", {})
4983                    .get("proband", {})
4984                    .get("subject", {})
4985                    .get("id", None)
4986                )
4987                sample = []
4988                if sample_subject:
4989                    sample.append(sample_subject)
4990                if sample_proband:
4991                    sample.append(sample_proband)
4992
4993                # Get sample ID within Pedigree
4994                pedigree_persons_list = (
4995                    param_exomiser_analysis_dict.get("phenopacket", {})
4996                    .get("pedigree", {})
4997                    .get("persons", {})
4998                )
4999
5000                # Create list with all sample ID in pedigree (if exists)
5001                pedigree_persons = []
5002                for person in pedigree_persons_list:
5003                    pedigree_persons.append(person.get("individualId"))
5004
5005                # Concat subject sample ID and samples ID in pedigreesamples
5006                samples = list(set(sample + pedigree_persons))
5007
5008                # Check if sample list is not empty
5009                if not samples:
5010                    log.error(f"No samples found")
5011                    raise ValueError(f"No samples found")
5012
5013                # Create VCF with sample (either sample in param or first one by default)
5014                # Export VCF file
5015                self.export_variant_vcf(
5016                    vcf_file=tmp_vcf_name,
5017                    remove_info=True,
5018                    add_samples=True,
5019                    list_samples=samples,
5020                    index=False,
5021                )
5022
5023                ### Execute Exomiser ###
5024                ########################
5025
5026                # Init command
5027                exomiser_command = ""
5028
5029                # Command exomiser options
5030                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
5031
5032                # Release
5033                exomiser_release = param_exomiser.get("release", None)
5034                if exomiser_release:
5035                    # phenotype data version
5036                    exomiser_options += (
5037                        f" --exomiser.phenotype.data-version={exomiser_release} "
5038                    )
5039                    # data version
5040                    exomiser_options += (
5041                        f" --exomiser.{assembly}.data-version={exomiser_release} "
5042                    )
5043                    # variant white list
5044                    variant_white_list_file = (
5045                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
5046                    )
5047                    if os.path.exists(
5048                        os.path.join(
5049                            databases_folders, assembly, variant_white_list_file
5050                        )
5051                    ):
5052                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
5053
5054                # transcript_source
5055                transcript_source = param_exomiser.get(
5056                    "transcript_source", None
5057                )  # ucsc, refseq, ensembl
5058                if transcript_source:
5059                    exomiser_options += (
5060                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
5061                    )
5062
5063                # If analysis contain proband param
5064                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
5065                    "proband", {}
5066                ):
5067                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
5068
5069                # If no proband (usually uniq sample)
5070                else:
5071                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
5072
5073                # Log
5074                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
5075
5076                # Run command
5077                result = subprocess.call(
5078                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
5079                )
5080                if result:
5081                    log.error("Exomiser command failed")
5082                    raise ValueError("Exomiser command failed")
5083
5084                ### RESULTS ###
5085                ###############
5086
5087                ### Annotate with TSV fields ###
5088
5089                # Init result tsv file
5090                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
5091
5092                # Init result tsv file
5093                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
5094
5095                # Parse TSV file and explode columns in INFO field
5096                if exomiser_to_info and os.path.exists(output_results_tsv):
5097
5098                    # Log
5099                    log.debug("Exomiser columns to VCF INFO field")
5100
5101                    # Retrieve columns and types
5102                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
5103                    output_results_tsv_df = self.get_query_to_df(query)
5104                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
5105
5106                    # Init concat fields for update
5107                    sql_query_update_concat_fields = []
5108
5109                    # Fields to avoid
5110                    fields_to_avoid = [
5111                        "CONTIG",
5112                        "START",
5113                        "END",
5114                        "REF",
5115                        "ALT",
5116                        "QUAL",
5117                        "FILTER",
5118                        "GENOTYPE",
5119                    ]
5120
5121                    # List all columns to add into header
5122                    for header_column in output_results_tsv_columns:
5123
5124                        # If header column is enable
5125                        if header_column not in fields_to_avoid:
5126
5127                            # Header info type
5128                            header_info_type = "String"
5129                            header_column_df = output_results_tsv_df[header_column]
5130                            header_column_df_dtype = header_column_df.dtype
5131                            if header_column_df_dtype == object:
5132                                if (
5133                                    pd.to_numeric(header_column_df, errors="coerce")
5134                                    .notnull()
5135                                    .all()
5136                                ):
5137                                    header_info_type = "Float"
5138                            else:
5139                                header_info_type = "Integer"
5140
5141                            # Header info
5142                            characters_to_validate = ["-"]
5143                            pattern = "[" + "".join(characters_to_validate) + "]"
5144                            header_info_name = re.sub(
5145                                pattern,
5146                                "_",
5147                                f"Exomiser_{header_column}".replace("#", ""),
5148                            )
5149                            header_info_number = "."
5150                            header_info_description = (
5151                                f"Exomiser {header_column} annotation"
5152                            )
5153                            header_info_source = "Exomiser"
5154                            header_info_version = "unknown"
5155                            header_info_code = CODE_TYPE_MAP[header_info_type]
5156                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
5157                                header_info_name,
5158                                header_info_number,
5159                                header_info_type,
5160                                header_info_description,
5161                                header_info_source,
5162                                header_info_version,
5163                                header_info_code,
5164                            )
5165
5166                            # Add field to add for update to concat fields
5167                            sql_query_update_concat_fields.append(
5168                                f"""
5169                                CASE
5170                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
5171                                    THEN concat(
5172                                        '{header_info_name}=',
5173                                        table_parquet."{header_column}",
5174                                        ';'
5175                                        )
5176
5177                                    ELSE ''
5178                                END
5179                            """
5180                            )
5181
5182                    # Update query
5183                    sql_query_update = f"""
5184                        UPDATE {table_variants} as table_variants
5185                            SET INFO = concat(
5186                                            CASE
5187                                                WHEN INFO NOT IN ('', '.')
5188                                                THEN INFO
5189                                                ELSE ''
5190                                            END,
5191                                            CASE
5192                                                WHEN table_variants.INFO NOT IN ('','.')
5193                                                THEN ';'
5194                                                ELSE ''
5195                                            END,
5196                                            (
5197                                            SELECT 
5198                                                concat(
5199                                                    {",".join(sql_query_update_concat_fields)}
5200                                                )
5201                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
5202                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
5203                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
5204                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5205                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5206                                            )
5207                                        )
5208                            ;
5209                        """
5210
5211                    # Update
5212                    self.conn.execute(sql_query_update)
5213
5214                ### Annotate with VCF INFO field ###
5215
5216                # Init result VCF file
5217                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
5218
5219                # If VCF exists
5220                if os.path.exists(output_results_vcf):
5221
5222                    # Log
5223                    log.debug("Exomiser result VCF update variants")
5224
5225                    # Find Exomiser INFO field annotation in header
5226                    with gzip.open(output_results_vcf, "rt") as f:
5227                        header_list = self.read_vcf_header(f)
5228                    exomiser_vcf_header = vcf.Reader(
5229                        io.StringIO("\n".join(header_list))
5230                    )
5231
5232                    # Add annotation INFO field to header
5233                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
5234
5235                    # Update variants with VCF
5236                    self.update_from_vcf(output_results_vcf)
5237
5238        return True

This function annotates variants with Exomiser.

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
5240    def annotation_snpeff(self, threads: int = None) -> None:
5241        """
5242        This function annotate with snpEff
5243
5244        :param threads: The number of threads to use
5245        :return: the value of the variable "return_value".
5246        """
5247
5248        # DEBUG
5249        log.debug("Start annotation with snpeff databases")
5250
5251        # Threads
5252        if not threads:
5253            threads = self.get_threads()
5254        log.debug("Threads: " + str(threads))
5255
5256        # DEBUG
5257        delete_tmp = True
5258        if self.get_config().get("verbosity", "warning") in ["debug"]:
5259            delete_tmp = False
5260            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5261
5262        # Config
5263        config = self.get_config()
5264        log.debug("Config: " + str(config))
5265
5266        # Config - Folders - Databases
5267        databases_folders = (
5268            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
5269        )
5270        log.debug("Databases annotations: " + str(databases_folders))
5271
5272        # Config - snpEff bin command
5273        snpeff_bin_command = get_bin_command(
5274            bin="snpEff.jar",
5275            tool="snpeff",
5276            bin_type="jar",
5277            config=config,
5278            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
5279        )
5280        if not snpeff_bin_command:
5281            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
5282            log.error(msg_err)
5283            raise ValueError(msg_err)
5284
5285        # Config - snpEff databases
5286        snpeff_databases = (
5287            config.get("folders", {})
5288            .get("databases", {})
5289            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
5290        )
5291        snpeff_databases = full_path(snpeff_databases)
5292        if snpeff_databases is not None and snpeff_databases != "":
5293            log.debug(f"Create snpEff databases folder")
5294            if not os.path.exists(snpeff_databases):
5295                os.makedirs(snpeff_databases)
5296
5297        # Param
5298        param = self.get_param()
5299        log.debug("Param: " + str(param))
5300
5301        # Param
5302        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
5303        log.debug("Options: " + str(options))
5304
5305        # Param - Assembly
5306        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5307
5308        # Param - Options
5309        snpeff_options = (
5310            param.get("annotation", {}).get("snpeff", {}).get("options", "")
5311        )
5312        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
5313        snpeff_csvstats = (
5314            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
5315        )
5316        if snpeff_stats:
5317            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
5318            snpeff_stats = full_path(snpeff_stats)
5319            snpeff_options += f" -stats {snpeff_stats}"
5320        if snpeff_csvstats:
5321            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
5322            snpeff_csvstats = full_path(snpeff_csvstats)
5323            snpeff_options += f" -csvStats {snpeff_csvstats}"
5324
5325        # Data
5326        table_variants = self.get_table_variants()
5327
5328        # Check if not empty
5329        log.debug("Check if not empty")
5330        sql_query_chromosomes = (
5331            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5332        )
5333        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
5334        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5335            log.info(f"VCF empty")
5336            return
5337
5338        # Export in VCF
5339        log.debug("Create initial file to annotate")
5340        tmp_vcf = NamedTemporaryFile(
5341            prefix=self.get_prefix(),
5342            dir=self.get_tmp_dir(),
5343            suffix=".vcf.gz",
5344            delete=True,
5345        )
5346        tmp_vcf_name = tmp_vcf.name
5347
5348        # VCF header
5349        vcf_reader = self.get_header()
5350        log.debug("Initial header: " + str(vcf_reader.infos))
5351
5352        # Existing annotations
5353        for vcf_annotation in self.get_header().infos:
5354
5355            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5356            log.debug(
5357                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5358            )
5359
5360        # Memory limit
5361        # if config.get("memory", None):
5362        #     memory_limit = config.get("memory", "8G")
5363        # else:
5364        #     memory_limit = "8G"
5365        memory_limit = self.get_memory("8G")
5366        log.debug(f"memory_limit: {memory_limit}")
5367
5368        # snpEff java options
5369        snpeff_java_options = (
5370            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5371        )
5372        log.debug(f"Exomiser java options: {snpeff_java_options}")
5373
5374        force_update_annotation = True
5375
5376        if "ANN" not in self.get_header().infos or force_update_annotation:
5377
5378            # Check snpEff database
5379            log.debug(f"Check snpEff databases {[assembly]}")
5380            databases_download_snpeff(
5381                folder=snpeff_databases, assemblies=[assembly], config=config
5382            )
5383
5384            # Export VCF file
5385            self.export_variant_vcf(
5386                vcf_file=tmp_vcf_name,
5387                remove_info=True,
5388                add_samples=False,
5389                index=True,
5390            )
5391
5392            # Tmp file
5393            err_files = []
5394            tmp_annotate_vcf = NamedTemporaryFile(
5395                prefix=self.get_prefix(),
5396                dir=self.get_tmp_dir(),
5397                suffix=".vcf",
5398                delete=False,
5399            )
5400            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5401            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5402            err_files.append(tmp_annotate_vcf_name_err)
5403
5404            # Command
5405            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5406            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5407            run_parallel_commands([snpeff_command], 1)
5408
5409            # Error messages
5410            log.info(f"Error/Warning messages:")
5411            error_message_command_all = []
5412            error_message_command_warning = []
5413            error_message_command_err = []
5414            for err_file in err_files:
5415                with open(err_file, "r") as f:
5416                    for line in f:
5417                        message = line.strip()
5418                        error_message_command_all.append(message)
5419                        if line.startswith("[W::"):
5420                            error_message_command_warning.append(message)
5421                        if line.startswith("[E::"):
5422                            error_message_command_err.append(f"{err_file}: " + message)
5423            # log info
5424            for message in list(
5425                set(error_message_command_err + error_message_command_warning)
5426            ):
5427                log.info(f"   {message}")
5428            # debug info
5429            for message in list(set(error_message_command_all)):
5430                log.debug(f"   {message}")
5431            # failed
5432            if len(error_message_command_err):
5433                log.error("Annotation failed: Error in commands")
5434                raise ValueError("Annotation failed: Error in commands")
5435
5436            # Find annotation in header
5437            with open(tmp_annotate_vcf_name, "rt") as f:
5438                header_list = self.read_vcf_header(f)
5439            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5440
5441            for ann in annovar_vcf_header.infos:
5442                if ann not in self.get_header().infos:
5443                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5444
5445            # Update variants
5446            log.info(f"Annotation - Updating...")
5447            self.update_from_vcf(tmp_annotate_vcf_name)
5448
5449        else:
5450            if "ANN" in self.get_header().infos:
5451                log.debug(f"Existing snpEff annotations in VCF")
5452            if force_update_annotation:
5453                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates variants with snpEff.

Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Workflow: export the current variants to a temporary bgzipped VCF, run
        ``table_annovar.pl`` once per configured database (piping its output
        through bcftools/sed/awk to strip ANNOVAR artifacts and empty fields),
        merge all per-database annotated VCFs with ``bcftools merge``, copy any
        new INFO definitions into the in-memory VCF header, then update the
        variants table from the merged VCF. Temporary files are removed at the
        end.

        :param threads: number of threads to use; defaults to
            ``self.get_threads()`` when falsy
        :return: None; returns early if the variants table is empty
        :raises ValueError: if the annovar or bcftools binaries cannot be
            resolved, if the Annovar databases folder is not configured, or if
            an annotation command wrote "[E::"/"ERROR" lines to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Temporary and stderr-capture file lists (all removed at the end)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but never read below — the
        # cleanup block at the end runs unconditionally ("if True:").
        # Presumably it was meant to gate that cleanup in debug mode; confirm.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl script table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            # If a list of folders is configured, only the first one is used
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (mapping: annovar database -> fields to keep)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly (param takes precedence over config)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder, created if missing
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated below when new INFO fields are found)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded True — fields already present in the header
        # are always re-annotated; the "already exists (skipped)" branch below
        # is therefore unreachable. Confirm whether this should be a parameter.
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never appended to or executed below.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to ".", no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (fed to bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database: download any missing database files
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No explicit fields: keep everything ("INFO" sentinel)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized each iteration, so
                # the merge-step .err appended after this loop is the only one
                # left over — and it is never scanned for errors. Confirm.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields selected for this database (original and renamed)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line, via shell append)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol: the annovar database name itself
                protocol = annotation

                # argument: optional per-protocol argument string
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                # forward any remaining user options verbatim (genebase handled above)
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar: run table_annovar.pl, then move the
                # <assembly>_multianno.vcf result to a .tmp.vcf for the pipe
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation):
                # turn Annovar's escaped semicolons ("\x3b") into commas
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): awk rebuilds
                # column 8 (INFO) keeping only key=value pairs whose value != "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # "^INFO/x" in -x means keep x and drop the rest; ANNOVAR_DATE
                # and ALLELE_END are always removed
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan this database's stderr capture file(s)
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            # htslib-style "[W::" / plain "WARNING" prefixes
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            # htslib-style "[E::" / plain "ERROR" prefixes
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info (deduplicated; set() loses ordering)
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the whole annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files, space-separated for the shell command
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: original export + all per-database VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged (bgzipped) VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Copy INFO definitions not yet present into the live header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): "if True:" always cleans up; delete_tmp (set above
            # for debug verbosity) looks like the intended gate — confirm.
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

None — the method updates the variants table in place.

def annotation_parquet(self, threads: int = None) -> None:
5846    def annotation_parquet(self, threads: int = None) -> None:
5847        """
5848        It takes a VCF file, and annotates it with a parquet file
5849
5850        :param threads: number of threads to use for the annotation
5851        :return: the value of the variable "result".
5852        """
5853
5854        # DEBUG
5855        log.debug("Start annotation with parquet databases")
5856
5857        # Threads
5858        if not threads:
5859            threads = self.get_threads()
5860        log.debug("Threads: " + str(threads))
5861
5862        # DEBUG
5863        delete_tmp = True
5864        if self.get_config().get("verbosity", "warning") in ["debug"]:
5865            delete_tmp = False
5866            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5867
5868        # Config
5869        databases_folders = set(
5870            self.get_config()
5871            .get("folders", {})
5872            .get("databases", {})
5873            .get("annotations", ["."])
5874            + self.get_config()
5875            .get("folders", {})
5876            .get("databases", {})
5877            .get("parquet", ["."])
5878        )
5879        log.debug("Databases annotations: " + str(databases_folders))
5880
5881        # Param
5882        annotations = (
5883            self.get_param()
5884            .get("annotation", {})
5885            .get("parquet", {})
5886            .get("annotations", None)
5887        )
5888        log.debug("Annotations: " + str(annotations))
5889
5890        # Assembly
5891        assembly = self.get_param().get(
5892            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5893        )
5894
5895        # Force Update Annotation
5896        force_update_annotation = (
5897            self.get_param()
5898            .get("annotation", {})
5899            .get("options", {})
5900            .get("annotations_update", False)
5901        )
5902        log.debug(f"force_update_annotation={force_update_annotation}")
5903        force_append_annotation = (
5904            self.get_param()
5905            .get("annotation", {})
5906            .get("options", {})
5907            .get("annotations_append", False)
5908        )
5909        log.debug(f"force_append_annotation={force_append_annotation}")
5910
5911        # Data
5912        table_variants = self.get_table_variants()
5913
5914        # Check if not empty
5915        log.debug("Check if not empty")
5916        sql_query_chromosomes_df = self.get_query_to_df(
5917            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5918        )
5919        if not sql_query_chromosomes_df["count"][0]:
5920            log.info(f"VCF empty")
5921            return
5922
5923        # VCF header
5924        vcf_reader = self.get_header()
5925        log.debug("Initial header: " + str(vcf_reader.infos))
5926
5927        # Nb Variants POS
5928        log.debug("NB Variants Start")
5929        nb_variants = self.conn.execute(
5930            f"SELECT count(*) AS count FROM variants"
5931        ).fetchdf()["count"][0]
5932        log.debug("NB Variants Stop")
5933
5934        # Existing annotations
5935        for vcf_annotation in self.get_header().infos:
5936
5937            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5938            log.debug(
5939                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5940            )
5941
5942        # Added columns
5943        added_columns = []
5944
5945        # drop indexes
5946        log.debug(f"Drop indexes...")
5947        self.drop_indexes()
5948
5949        if annotations:
5950
5951            if "ALL" in annotations:
5952
5953                all_param = annotations.get("ALL", {})
5954                all_param_formats = all_param.get("formats", None)
5955                all_param_releases = all_param.get("releases", None)
5956
5957                databases_infos_dict = self.scan_databases(
5958                    database_formats=all_param_formats,
5959                    database_releases=all_param_releases,
5960                )
5961                for database_infos in databases_infos_dict.keys():
5962                    if database_infos not in annotations:
5963                        annotations[database_infos] = {"INFO": None}
5964
5965            for annotation in annotations:
5966
5967                if annotation in ["ALL"]:
5968                    continue
5969
5970                # Annotation Name
5971                annotation_name = os.path.basename(annotation)
5972
5973                # Annotation fields
5974                annotation_fields = annotations[annotation]
5975                if not annotation_fields:
5976                    annotation_fields = {"INFO": None}
5977
5978                log.debug(f"Annotation '{annotation_name}'")
5979                log.debug(
5980                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5981                )
5982
5983                # Create Database
5984                database = Database(
5985                    database=annotation,
5986                    databases_folders=databases_folders,
5987                    assembly=assembly,
5988                )
5989
5990                # Find files
5991                parquet_file = database.get_database()
5992                parquet_hdr_file = database.get_header_file()
5993                parquet_type = database.get_type()
5994
5995                # Check if files exists
5996                if not parquet_file or not parquet_hdr_file:
5997                    msg_err_list = []
5998                    if not parquet_file:
5999                        msg_err_list.append(
6000                            f"Annotation failed: Annotation file not found"
6001                        )
6002                    if parquet_file and not parquet_hdr_file:
6003                        msg_err_list.append(
6004                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
6005                        )
6006
6007                    log.error(". ".join(msg_err_list))
6008                    raise ValueError(". ".join(msg_err_list))
6009                else:
6010                    # Get parquet connexion
6011                    parquet_sql_attach = database.get_sql_database_attach(
6012                        output="query"
6013                    )
6014                    if parquet_sql_attach:
6015                        self.conn.execute(parquet_sql_attach)
6016                    parquet_file_link = database.get_sql_database_link()
6017                    # Log
6018                    log.debug(
6019                        f"Annotation '{annotation_name}' - file: "
6020                        + str(parquet_file)
6021                        + " and "
6022                        + str(parquet_hdr_file)
6023                    )
6024
6025                    # Database full header columns
6026                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
6027                        parquet_hdr_file
6028                    )
6029                    # Log
6030                    log.debug(
6031                        "Annotation database header columns : "
6032                        + str(parquet_hdr_vcf_header_columns)
6033                    )
6034
6035                    # Load header as VCF object
6036                    parquet_hdr_vcf_header_infos = database.get_header().infos
6037                    # Log
6038                    log.debug(
6039                        "Annotation database header: "
6040                        + str(parquet_hdr_vcf_header_infos)
6041                    )
6042
6043                    # Get extra infos
6044                    parquet_columns = database.get_extra_columns()
6045                    # Log
6046                    log.debug("Annotation database Columns: " + str(parquet_columns))
6047
6048                    # Add extra columns if "ALL" in annotation_fields
6049                    # if "ALL" in annotation_fields:
6050                    #     allow_add_extra_column = True
6051                    if "ALL" in annotation_fields and database.get_extra_columns():
6052                        for extra_column in database.get_extra_columns():
6053                            if (
6054                                extra_column not in annotation_fields
6055                                and extra_column.replace("INFO/", "")
6056                                not in parquet_hdr_vcf_header_infos
6057                            ):
6058                                parquet_hdr_vcf_header_infos[extra_column] = (
6059                                    vcf.parser._Info(
6060                                        extra_column,
6061                                        ".",
6062                                        "String",
6063                                        f"{extra_column} description",
6064                                        "unknown",
6065                                        "unknown",
6066                                        self.code_type_map["String"],
6067                                    )
6068                                )
6069
6070                    # For all fields in database
6071                    annotation_fields_all = False
6072                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
6073                        annotation_fields_all = True
6074                        annotation_fields = {
6075                            key: key for key in parquet_hdr_vcf_header_infos
6076                        }
6077
6078                        log.debug(
6079                            "Annotation database header - All annotations added: "
6080                            + str(annotation_fields)
6081                        )
6082
6083                    # Init
6084
6085                    # List of annotation fields to use
6086                    sql_query_annotation_update_info_sets = []
6087
6088                    # List of annotation to agregate
6089                    sql_query_annotation_to_agregate = []
6090
6091                    # Number of fields
6092                    nb_annotation_field = 0
6093
6094                    # Annotation fields processed
6095                    annotation_fields_processed = []
6096
6097                    # Columns mapping
6098                    map_columns = database.map_columns(
6099                        columns=annotation_fields, prefixes=["INFO/"]
6100                    )
6101
6102                    # Query dict for fields to remove (update option)
6103                    query_dict_remove = {}
6104
6105                    # Fetch Anotation fields
6106                    for annotation_field in annotation_fields:
6107
6108                        # annotation_field_column
6109                        annotation_field_column = map_columns.get(
6110                            annotation_field, "INFO"
6111                        )
6112
6113                        # field new name, if parametered
6114                        annotation_fields_new_name = annotation_fields.get(
6115                            annotation_field, annotation_field
6116                        )
6117                        if not annotation_fields_new_name:
6118                            annotation_fields_new_name = annotation_field
6119
6120                        # To annotate
6121                        # force_update_annotation = True
6122                        # force_append_annotation = True
6123                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
6124                        if annotation_field in parquet_hdr_vcf_header_infos and (
6125                            force_update_annotation
6126                            or force_append_annotation
6127                            or (
6128                                annotation_fields_new_name
6129                                not in self.get_header().infos
6130                            )
6131                        ):
6132
6133                            # Add field to annotation to process list
6134                            annotation_fields_processed.append(
6135                                annotation_fields_new_name
6136                            )
6137
6138                            # explode infos for the field
6139                            annotation_fields_new_name_info_msg = ""
6140                            if (
6141                                force_update_annotation
6142                                and annotation_fields_new_name
6143                                in self.get_header().infos
6144                            ):
6145                                # Remove field from INFO
6146                                query = f"""
6147                                    UPDATE {table_variants} as table_variants
6148                                    SET INFO = REGEXP_REPLACE(
6149                                                concat(table_variants.INFO,''),
6150                                                ';*{annotation_fields_new_name}=[^;]*',
6151                                                ''
6152                                                )
6153                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
6154                                """
6155                                annotation_fields_new_name_info_msg = " [update]"
6156                                query_dict_remove[
6157                                    f"remove 'INFO/{annotation_fields_new_name}'"
6158                                ] = query
6159
6160                            # Sep between fields in INFO
6161                            nb_annotation_field += 1
6162                            if nb_annotation_field > 1:
6163                                annotation_field_sep = ";"
6164                            else:
6165                                annotation_field_sep = ""
6166
6167                            log.info(
6168                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
6169                            )
6170
6171                            # Add INFO field to header
6172                            parquet_hdr_vcf_header_infos_number = (
6173                                parquet_hdr_vcf_header_infos[annotation_field].num
6174                                or "."
6175                            )
6176                            parquet_hdr_vcf_header_infos_type = (
6177                                parquet_hdr_vcf_header_infos[annotation_field].type
6178                                or "String"
6179                            )
6180                            parquet_hdr_vcf_header_infos_description = (
6181                                parquet_hdr_vcf_header_infos[annotation_field].desc
6182                                or f"{annotation_field} description"
6183                            )
6184                            parquet_hdr_vcf_header_infos_source = (
6185                                parquet_hdr_vcf_header_infos[annotation_field].source
6186                                or "unknown"
6187                            )
6188                            parquet_hdr_vcf_header_infos_version = (
6189                                parquet_hdr_vcf_header_infos[annotation_field].version
6190                                or "unknown"
6191                            )
6192
6193                            vcf_reader.infos[annotation_fields_new_name] = (
6194                                vcf.parser._Info(
6195                                    annotation_fields_new_name,
6196                                    parquet_hdr_vcf_header_infos_number,
6197                                    parquet_hdr_vcf_header_infos_type,
6198                                    parquet_hdr_vcf_header_infos_description,
6199                                    parquet_hdr_vcf_header_infos_source,
6200                                    parquet_hdr_vcf_header_infos_version,
6201                                    self.code_type_map[
6202                                        parquet_hdr_vcf_header_infos_type
6203                                    ],
6204                                )
6205                            )
6206
6207                            # Append
6208                            if force_append_annotation:
6209                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
6210                            else:
6211                                query_case_when_append = ""
6212
6213                            # Annotation/Update query fields
6214                            # Found in INFO column
6215                            if (
6216                                annotation_field_column == "INFO"
6217                                and "INFO" in parquet_hdr_vcf_header_columns
6218                            ):
6219                                sql_query_annotation_update_info_sets.append(
6220                                    f"""
6221                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
6222                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
6223                                        ELSE ''
6224                                    END
6225                                """
6226                                )
6227                            # Found in a specific column
6228                            else:
6229                                sql_query_annotation_update_info_sets.append(
6230                                    f"""
6231                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
6232                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
6233                                        ELSE ''
6234                                    END
6235                                """
6236                                )
6237                                sql_query_annotation_to_agregate.append(
6238                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
6239                                )
6240
6241                        # Not to annotate
6242                        else:
6243
6244                            if force_update_annotation:
6245                                annotation_message = "forced"
6246                            else:
6247                                annotation_message = "skipped"
6248
6249                            if annotation_field not in parquet_hdr_vcf_header_infos:
6250                                log.warning(
6251                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
6252                                )
6253                            if annotation_fields_new_name in self.get_header().infos:
6254                                log.warning(
6255                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
6256                                )
6257
6258                    # Check if ALL fields have to be annotated. Thus concat all INFO field
6259                    # allow_annotation_full_info = True
6260                    allow_annotation_full_info = not force_append_annotation
6261
6262                    if parquet_type in ["regions"]:
6263                        allow_annotation_full_info = False
6264
6265                    if (
6266                        allow_annotation_full_info
6267                        and nb_annotation_field == len(annotation_fields)
6268                        and annotation_fields_all
6269                        and (
6270                            "INFO" in parquet_hdr_vcf_header_columns
6271                            and "INFO" in database.get_extra_columns()
6272                        )
6273                    ):
6274                        log.debug("Column INFO annotation enabled")
6275                        sql_query_annotation_update_info_sets = []
6276                        sql_query_annotation_update_info_sets.append(
6277                            f" table_parquet.INFO "
6278                        )
6279
6280                    if sql_query_annotation_update_info_sets:
6281
6282                        # Annotate
6283                        log.info(f"Annotation '{annotation_name}' - Annotation...")
6284
6285                        # Join query annotation update info sets for SQL
6286                        sql_query_annotation_update_info_sets_sql = ",".join(
6287                            sql_query_annotation_update_info_sets
6288                        )
6289
6290                        # Check chromosomes list (and variants infos)
6291                        sql_query_chromosomes = f"""
6292                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
6293                            FROM {table_variants} as table_variants
6294                            GROUP BY table_variants."#CHROM"
6295                            ORDER BY table_variants."#CHROM"
6296                            """
6297                        sql_query_chromosomes_df = self.conn.execute(
6298                            sql_query_chromosomes
6299                        ).df()
6300                        sql_query_chromosomes_dict = {
6301                            entry["CHROM"]: {
6302                                "count": entry["count_variants"],
6303                                "min": entry["min_variants"],
6304                                "max": entry["max_variants"],
6305                            }
6306                            for index, entry in sql_query_chromosomes_df.iterrows()
6307                        }
6308
6309                        # Init
6310                        nb_of_query = 0
6311                        nb_of_variant_annotated = 0
6312                        query_dict = query_dict_remove
6313
6314                        # for chrom in sql_query_chromosomes_df["CHROM"]:
6315                        for chrom in sql_query_chromosomes_dict:
6316
6317                            # Number of variant by chromosome
6318                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
6319                                chrom, {}
6320                            ).get("count", 0)
6321
6322                            log.debug(
6323                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
6324                            )
6325
6326                            # Annotation with regions database
6327                            if parquet_type in ["regions"]:
6328                                sql_query_annotation_from_clause = f"""
6329                                    FROM (
6330                                        SELECT 
6331                                            '{chrom}' AS \"#CHROM\",
6332                                            table_variants_from.\"POS\" AS \"POS\",
6333                                            {",".join(sql_query_annotation_to_agregate)}
6334                                        FROM {table_variants} as table_variants_from
6335                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
6336                                            table_parquet_from."#CHROM" = '{chrom}'
6337                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
6338                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
6339                                        )
6340                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
6341                                        GROUP BY table_variants_from.\"POS\"
6342                                        )
6343                                        as table_parquet
6344                                """
6345
6346                                sql_query_annotation_where_clause = """
6347                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
6348                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
6349                                """
6350
6351                            # Annotation with variants database
6352                            else:
6353                                sql_query_annotation_from_clause = f"""
6354                                    FROM {parquet_file_link} as table_parquet
6355                                """
6356                                sql_query_annotation_where_clause = f"""
6357                                    table_variants."#CHROM" = '{chrom}'
6358                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
6359                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
6360                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
6361                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
6362                                """
6363
6364                            # Create update query
6365                            sql_query_annotation_chrom_interval_pos = f"""
6366                                UPDATE {table_variants} as table_variants
6367                                    SET INFO = 
6368                                        concat(
6369                                            CASE WHEN table_variants.INFO NOT IN ('','.')
6370                                                THEN table_variants.INFO
6371                                                ELSE ''
6372                                            END
6373                                            ,
6374                                            CASE WHEN table_variants.INFO NOT IN ('','.')
6375                                                        AND (
6376                                                        concat({sql_query_annotation_update_info_sets_sql})
6377                                                        )
6378                                                        NOT IN ('','.') 
6379                                                    THEN ';'
6380                                                    ELSE ''
6381                                            END
6382                                            ,
6383                                            {sql_query_annotation_update_info_sets_sql}
6384                                            )
6385                                    {sql_query_annotation_from_clause}
6386                                    WHERE {sql_query_annotation_where_clause}
6387                                    ;
6388                                """
6389
6390                            # Add update query to dict
6391                            query_dict[
6392                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
6393                            ] = sql_query_annotation_chrom_interval_pos
6394
6395                        nb_of_query = len(query_dict)
6396                        num_query = 0
6397
6398                        # SET max_expression_depth TO x
6399                        self.conn.execute("SET max_expression_depth TO 10000")
6400
6401                        for query_name in query_dict:
6402                            query = query_dict[query_name]
6403                            num_query += 1
6404                            log.info(
6405                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
6406                            )
6407                            result = self.conn.execute(query)
6408                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
6409                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
6410                            log.info(
6411                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
6412                            )
6413
6414                        log.info(
6415                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
6416                        )
6417
6418                    else:
6419
6420                        log.info(
6421                            f"Annotation '{annotation_name}' - No Annotations available"
6422                        )
6423
6424                    log.debug("Final header: " + str(vcf_reader.infos))
6425
6426        # Remove added columns
6427        for added_column in added_columns:
6428            self.drop_column(column=added_column)

It takes a VCF file and annotates its variants with a Parquet annotation file.

Parameters
  • threads: number of threads to use for the annotation
Returns

the value of the variable "result".

def annotation_splice(self, threads: int = None) -> None:
6430    def annotation_splice(self, threads: int = None) -> None:
6431        """
6432        This function annotate with snpEff
6433
6434        :param threads: The number of threads to use
6435        :return: the value of the variable "return_value".
6436        """
6437
6438        # DEBUG
6439        log.debug("Start annotation with splice tools")
6440
6441        # Threads
6442        if not threads:
6443            threads = self.get_threads()
6444        log.debug("Threads: " + str(threads))
6445
6446        # DEBUG
6447        delete_tmp = True
6448        if self.get_config().get("verbosity", "warning") in ["debug"]:
6449            delete_tmp = False
6450            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6451
6452        # Config
6453        config = self.get_config()
6454        log.debug("Config: " + str(config))
6455        splice_config = config.get("tools", {}).get("splice", {})
6456        if not splice_config:
6457            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6458            msg_err = "No Splice tool config"
6459            raise ValueError(msg_err)
6460        log.debug(f"splice_config: {splice_config}")
6461
6462        # Config - Folders - Databases
6463        databases_folders = (
6464            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6465        )
6466        log.debug("Databases annotations: " + str(databases_folders))
6467
6468        # Splice docker image
6469        splice_docker_image = splice_config.get("docker").get("image")
6470
6471        # Pull splice image if it's not already there
6472        if not check_docker_image_exists(splice_docker_image):
6473            log.warning(
6474                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6475            )
6476            try:
6477                command(f"docker pull {splice_config.get('docker').get('image')}")
6478            except subprocess.CalledProcessError:
6479                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6480                log.error(msg_err)
6481                raise ValueError(msg_err)
6482
6483        # Config - splice databases
6484        splice_databases = (
6485            config.get("folders", {})
6486            .get("databases", {})
6487            .get("splice", DEFAULT_SPLICE_FOLDER)
6488        )
6489        splice_databases = full_path(splice_databases)
6490
6491        # Param
6492        param = self.get_param()
6493        log.debug("Param: " + str(param))
6494
6495        # Param
6496        options = param.get("annotation", {}).get("splice", {}).get("options", {})
6497        log.debug("Options: " + str(options))
6498
6499        # Data
6500        table_variants = self.get_table_variants()
6501
6502        # Check if not empty
6503        log.debug("Check if not empty")
6504        sql_query_chromosomes = (
6505            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6506        )
6507        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6508            log.info("VCF empty")
6509            return None
6510
6511        # Export in VCF
6512        log.debug("Create initial file to annotate")
6513
6514        # Create output folder / work folder
6515        if options.get("output_folder", ""):
6516            output_folder = options.get("output_folder", "")
6517            if not os.path.exists(output_folder):
6518                Path(output_folder).mkdir(parents=True, exist_ok=True)
6519        else:
6520            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6521            if not os.path.exists(output_folder):
6522                Path(output_folder).mkdir(parents=True, exist_ok=True)
6523
6524        if options.get("workdir", ""):
6525            workdir = options.get("workdir", "")
6526        else:
6527            workdir = "/work"
6528
6529        # Create tmp VCF file
6530        tmp_vcf = NamedTemporaryFile(
6531            prefix=self.get_prefix(),
6532            dir=output_folder,
6533            suffix=".vcf",
6534            delete=False,
6535        )
6536        tmp_vcf_name = tmp_vcf.name
6537
6538        # VCF header
6539        header = self.get_header()
6540
6541        # Existing annotations
6542        for vcf_annotation in self.get_header().infos:
6543
6544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6545            log.debug(
6546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6547            )
6548
6549        # Memory limit
6550        if config.get("memory", None):
6551            memory_limit = config.get("memory", "8G").upper()
6552            # upper()
6553        else:
6554            memory_limit = "8G"
6555        log.debug(f"memory_limit: {memory_limit}")
6556
6557        # Check number of variants to annotate
6558        where_clause_regex_spliceai = r"SpliceAI_\w+"
6559        where_clause_regex_spip = r"SPiP_\w+"
6560        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6561        df_list_of_variants_to_annotate = self.get_query_to_df(
6562            query=f""" SELECT * FROM variants {where_clause} """
6563        )
6564        if len(df_list_of_variants_to_annotate) == 0:
6565            log.warning(
6566                f"No variants to annotate with splice. Variants probably already annotated with splice"
6567            )
6568            return None
6569        else:
6570            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6571
6572        # Export VCF file
6573        self.export_variant_vcf(
6574            vcf_file=tmp_vcf_name,
6575            remove_info=True,
6576            add_samples=True,
6577            index=False,
6578            where_clause=where_clause,
6579        )
6580        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
6581        if any(value for value in splice_config.values() if value is None):
6582            log.warning("At least one splice config parameter is empty")
6583            # exit annotation_splice
6584            return None
6585
6586        # Params in splice nf
6587        def check_values(dico: dict):
6588            """
6589            Ensure parameters for NF splice pipeline
6590            """
6591            for key, val in dico.items():
6592                if key == "genome":
6593                    if any(
6594                        assemb in options.get("genome", {})
6595                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6596                    ):
6597                        yield f"--{key} hg19"
6598                    elif any(
6599                        assemb in options.get("genome", {})
6600                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6601                    ):
6602                        yield f"--{key} hg38"
6603                elif (
6604                    (isinstance(val, str) and val)
6605                    or isinstance(val, int)
6606                    or isinstance(val, bool)
6607                ):
6608                    yield f"--{key} {val}"
6609
6610        # Genome
6611        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6612        options["genome"] = genome
6613        # NF params
6614        nf_params = []
6615        # Add options
6616        if options:
6617            log.debug(options)
6618            nf_params = list(check_values(options))
6619            log.debug(f"Splice NF params: {' '.join(nf_params)}")
6620        else:
6621            log.debug("No NF params provided")
6622        # Add threads
6623        if "threads" not in options.keys():
6624            nf_params.append(f"--threads {threads}")
6625        # Genome path
6626        genome_path = find_genome(
6627            config.get("folders", {})
6628            .get("databases", {})
6629            .get("genomes", DEFAULT_GENOME_FOLDER),
6630            file=f"{genome}.fa",
6631        )
6632        # Add genome path
6633        if not genome_path:
6634            raise ValueError(
6635                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6636            )
6637        else:
6638            log.debug(f"Genome: {genome_path}")
6639            nf_params.append(f"--genome_path {genome_path}")
6640
6641        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6642            """
6643            Setting up updated databases for SPiP and SpliceAI
6644            """
6645
6646            try:
6647
6648                # SpliceAI assembly transcriptome
6649                spliceai_assembly = os.path.join(
6650                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
6651                    options.get("genome"),
6652                    "transcriptome",
6653                )
6654                spip_assembly = options.get("genome")
6655
6656                spip = find(
6657                    f"transcriptome_{spip_assembly}.RData",
6658                    config.get("folders", {}).get("databases", {}).get("spip", {}),
6659                )
6660                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6661                log.debug(f"SPiP annotations: {spip}")
6662                log.debug(f"SpliceAI annotations: {spliceai}")
6663                if spip and spliceai:
6664                    return [
6665                        f"--spip_transcriptome {spip}",
6666                        f"--spliceai_transcriptome {spliceai}",
6667                    ]
6668                else:
6669                    log.warning(
6670                        "Can't find splice databases in configuration, use annotations file from image"
6671                    )
6672            except TypeError:
6673                log.warning(
6674                    "Can't find splice databases in configuration, use annotations file from image"
6675                )
6676                return []
6677
6678        # Add options, check if transcriptome option have already beend provided
6679        if (
6680            "spip_transcriptome" not in nf_params
6681            and "spliceai_transcriptome" not in nf_params
6682        ):
6683            splice_reference = splice_annotations(options, config)
6684            if splice_reference:
6685                nf_params.extend(splice_reference)
6686        # nf_params.append(f"--output_folder {output_folder}")
6687        random_uuid = f"HOWARD-SPLICE-{get_random()}"
6688        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6689        log.debug(cmd)
6690        splice_config["docker"]["command"] = cmd
6691
6692        # Ensure proxy is set
6693        proxy = [
6694            f"-e {var}={os.getenv(var)}"
6695            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
6696            if os.getenv(var) is not None
6697        ]
6698        docker_cmd = get_bin_command(
6699            tool="splice",
6700            bin_type="docker",
6701            config=config,
6702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6703            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
6704        )
6705        # print(docker_cmd)
6706        # exit()
6707        # Docker debug
6708        # if splice_config.get("rm_container"):
6709        #     rm_container = "--rm"
6710        # else:
6711        #     rm_container = ""
6712        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6713        log.debug(docker_cmd)
6714        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6715        log.debug(res.stdout)
6716        if res.stderr:
6717            log.error(res.stderr)
6718        res.check_returncode()
6719        # Update variants
6720        log.info("Annotation - Updating...")
6721        # Test find output vcf
6722        log.debug(
6723            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6724        )
6725        output_vcf = []
6726        # Wrong folder to look in
6727        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6728            if (
6729                files
6730                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6731            ):
6732                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6733        # log.debug(os.listdir(options.get("output_folder")))
6734        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6735        if not output_vcf:
6736            log.debug(
6737                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6738            )
6739        else:
6740            # Get new header from annotated vcf
6741            log.debug(f"Initial header: {len(header.infos)} fields")
6742            # Create new header with splice infos
6743            new_vcf = Variants(input=output_vcf[0])
6744            new_vcf_header = new_vcf.get_header().infos
6745            for keys, infos in new_vcf_header.items():
6746                if keys not in header.infos.keys():
6747                    header.infos[keys] = infos
6748            log.debug(f"New header: {len(header.infos)} fields")
6749            log.debug(f"Splice tmp output: {output_vcf[0]}")
6750            self.update_from_vcf(output_vcf[0])
6751
6752        # Remove file
6753        remove_if_exists(output_vcf)

This function annotates variants with splice prediction tools (SPiP and SpliceAI).

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def get_config_default(self, name: str) -> dict:
6759    def get_config_default(self, name: str) -> dict:
6760        """
6761        The function `get_config_default` returns a dictionary containing default configurations for
6762        various calculations and prioritizations.
6763
6764        :param name: The `get_config_default` function returns a dictionary containing default
6765        configurations for different calculations and prioritizations. The `name` parameter is used to
6766        specify which specific configuration to retrieve from the dictionary
6767        :type name: str
6768        :return: The function `get_config_default` returns a dictionary containing default configuration
6769        settings for different calculations and prioritizations. The specific configuration settings are
6770        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6771        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6772        returned. If there is no match, an empty dictionary is returned.
6773        """
6774
6775        config_default = {
6776            "calculations": {
6777                "variant_chr_pos_alt_ref": {
6778                    "type": "sql",
6779                    "name": "variant_chr_pos_alt_ref",
6780                    "description": "Create a variant ID with chromosome, position, alt and ref",
6781                    "available": False,
6782                    "output_column_name": "variant_chr_pos_alt_ref",
6783                    "output_column_type": "String",
6784                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6785                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6786                    "operation_info": True,
6787                },
6788                "VARTYPE": {
6789                    "type": "sql",
6790                    "name": "VARTYPE",
6791                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6792                    "available": True,
6793                    "table": "variants",
6794                    "output_column_name": "VARTYPE",
6795                    "output_column_type": "String",
6796                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6797                    "operation_query": """
6798                            CASE
6799                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6800                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6801                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6802                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6803                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6804                                ELSE 'UNDEFINED'
6805                            END
6806                            """,
6807                    "info_fields": ["SVTYPE"],
6808                    "operation_info": True,
6809                },
6810                "snpeff_hgvs": {
6811                    "type": "python",
6812                    "name": "snpeff_hgvs",
6813                    "description": "HGVS nomenclatures from snpEff annotation",
6814                    "available": True,
6815                    "function_name": "calculation_extract_snpeff_hgvs",
6816                    "function_params": ["snpeff_hgvs", "ANN"],
6817                },
6818                "snpeff_ann_explode": {
6819                    "type": "python",
6820                    "name": "snpeff_ann_explode",
6821                    "description": "Explode snpEff annotations with uniquify values",
6822                    "available": True,
6823                    "function_name": "calculation_snpeff_ann_explode",
6824                    "function_params": [False, "fields", "snpeff_", "ANN"],
6825                },
6826                "snpeff_ann_explode_uniquify": {
6827                    "type": "python",
6828                    "name": "snpeff_ann_explode_uniquify",
6829                    "description": "Explode snpEff annotations",
6830                    "available": True,
6831                    "function_name": "calculation_snpeff_ann_explode",
6832                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6833                },
6834                "snpeff_ann_explode_json": {
6835                    "type": "python",
6836                    "name": "snpeff_ann_explode_json",
6837                    "description": "Explode snpEff annotations in JSON format",
6838                    "available": True,
6839                    "function_name": "calculation_snpeff_ann_explode",
6840                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6841                },
6842                "NOMEN": {
6843                    "type": "python",
6844                    "name": "NOMEN",
6845                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
6846                    "available": True,
6847                    "function_name": "calculation_extract_nomen",
6848                    "function_params": [],
6849                },
6850                "RENAME_INFO_FIELDS": {
6851                    "type": "python",
6852                    "name": "RENAME_INFO_FIELDS",
6853                    "description": "Rename or remove INFO/tags",
6854                    "available": True,
6855                    "function_name": "calculation_rename_info_fields",
6856                    "function_params": [],
6857                },
6858                "FINDBYPIPELINE": {
6859                    "type": "python",
6860                    "name": "FINDBYPIPELINE",
6861                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6862                    "available": True,
6863                    "function_name": "calculation_find_by_pipeline",
6864                    "function_params": ["findbypipeline"],
6865                },
6866                "FINDBYSAMPLE": {
6867                    "type": "python",
6868                    "name": "FINDBYSAMPLE",
6869                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6870                    "available": True,
6871                    "function_name": "calculation_find_by_pipeline",
6872                    "function_params": ["findbysample"],
6873                },
6874                "GENOTYPECONCORDANCE": {
6875                    "type": "python",
6876                    "name": "GENOTYPECONCORDANCE",
6877                    "description": "Concordance of genotype for multi caller VCF",
6878                    "available": True,
6879                    "function_name": "calculation_genotype_concordance",
6880                    "function_params": [],
6881                },
6882                "BARCODE": {
6883                    "type": "python",
6884                    "name": "BARCODE",
6885                    "description": "BARCODE as VaRank tool",
6886                    "available": True,
6887                    "function_name": "calculation_barcode",
6888                    "function_params": [],
6889                },
6890                "BARCODEFAMILY": {
6891                    "type": "python",
6892                    "name": "BARCODEFAMILY",
6893                    "description": "BARCODEFAMILY as VaRank tool",
6894                    "available": True,
6895                    "function_name": "calculation_barcode_family",
6896                    "function_params": ["BCF"],
6897                },
6898                "TRIO": {
6899                    "type": "python",
6900                    "name": "TRIO",
6901                    "description": "Inheritance for a trio family",
6902                    "available": True,
6903                    "function_name": "calculation_trio",
6904                    "function_params": [],
6905                },
6906                "VAF": {
6907                    "type": "python",
6908                    "name": "VAF",
6909                    "description": "Variant Allele Frequency (VAF) harmonization",
6910                    "available": True,
6911                    "function_name": "calculation_vaf_normalization",
6912                    "function_params": [],
6913                },
6914                "VAF_stats": {
6915                    "type": "python",
6916                    "name": "VAF_stats",
6917                    "description": "Variant Allele Frequency (VAF) statistics",
6918                    "available": True,
6919                    "function_name": "calculation_genotype_stats",
6920                    "function_params": ["VAF"],
6921                },
6922                "DP_stats": {
6923                    "type": "python",
6924                    "name": "DP_stats",
6925                    "description": "Depth (DP) statistics",
6926                    "available": True,
6927                    "function_name": "calculation_genotype_stats",
6928                    "function_params": ["DP"],
6929                },
6930                "variant_id": {
6931                    "type": "python",
6932                    "name": "variant_id",
6933                    "description": "Variant ID generated from variant position and type",
6934                    "available": True,
6935                    "function_name": "calculation_variant_id",
6936                    "function_params": [],
6937                },
6938                "transcripts_json": {
6939                    "type": "python",
6940                    "name": "transcripts_json",
6941                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6942                    "available": True,
6943                    "function_name": "calculation_transcripts_annotation",
6944                    "function_params": ["transcripts_json", None],
6945                },
6946                "transcripts_ann": {
6947                    "type": "python",
6948                    "name": "transcripts_ann",
6949                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6950                    "available": True,
6951                    "function_name": "calculation_transcripts_annotation",
6952                    "function_params": [None, "transcripts_ann"],
6953                },
6954                "transcripts_annotations": {
6955                    "type": "python",
6956                    "name": "transcripts_annotations",
6957                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6958                    "available": True,
6959                    "function_name": "calculation_transcripts_annotation",
6960                    "function_params": [None, None],
6961                },
6962                "transcripts_prioritization": {
6963                    "type": "python",
6964                    "name": "transcripts_prioritization",
6965                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6966                    "available": True,
6967                    "function_name": "calculation_transcripts_prioritization",
6968                    "function_params": [],
6969                },
6970                "transcripts_export": {
6971                    "type": "python",
6972                    "name": "transcripts_export",
6973                    "description": "Export transcripts table/view as a file (using param.json)",
6974                    "available": True,
6975                    "function_name": "calculation_transcripts_export",
6976                    "function_params": [],
6977                },
6978            },
6979            "prioritizations": {
6980                "default": {
6981                    "ANN2": [
6982                        {
6983                            "type": "contains",
6984                            "value": "HIGH",
6985                            "score": 5,
6986                            "flag": "PASS",
6987                            "comment": [
6988                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6989                            ],
6990                        },
6991                        {
6992                            "type": "contains",
6993                            "value": "MODERATE",
6994                            "score": 3,
6995                            "flag": "PASS",
6996                            "comment": [
6997                                "A non-disruptive variant that might change protein effectiveness"
6998                            ],
6999                        },
7000                        {
7001                            "type": "contains",
7002                            "value": "LOW",
7003                            "score": 0,
7004                            "flag": "FILTERED",
7005                            "comment": [
7006                                "Assumed to be mostly harmless or unlikely to change protein behavior"
7007                            ],
7008                        },
7009                        {
7010                            "type": "contains",
7011                            "value": "MODIFIER",
7012                            "score": 0,
7013                            "flag": "FILTERED",
7014                            "comment": [
7015                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
7016                            ],
7017                        },
7018                    ],
7019                }
7020            },
7021        }
7022
7023        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The name of the configuration section to retrieve (e.g. "calculations" or "prioritizations"); it selects which entry of the default configuration dictionary is returned
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
7025    def get_config_json(
7026        self, name: str, config_dict: dict = {}, config_file: str = None
7027    ) -> dict:
7028        """
7029        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
7030        default values, a dictionary, and a file.
7031
7032        :param name: The `name` parameter in the `get_config_json` function is a string that represents
7033        the name of the configuration. It is used to identify and retrieve the configuration settings
7034        for a specific component or module
7035        :type name: str
7036        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
7037        dictionary that allows you to provide additional configuration settings or overrides. When you
7038        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
7039        the key is the configuration setting you want to override or
7040        :type config_dict: dict
7041        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
7042        specify the path to a configuration file that contains additional settings. If provided, the
7043        function will read the contents of this file and update the configuration dictionary with the
7044        values found in the file, overriding any existing values with the
7045        :type config_file: str
7046        :return: The function `get_config_json` returns a dictionary containing the configuration
7047        settings.
7048        """
7049
7050        # Create with default prioritizations
7051        config_default = self.get_config_default(name=name)
7052        configuration = config_default
7053        # log.debug(f"configuration={configuration}")
7054
7055        # Replace prioritizations from dict
7056        for config in config_dict:
7057            configuration[config] = config_dict[config]
7058
7059        # Replace prioritizations from file
7060        config_file = full_path(config_file)
7061        if config_file:
7062            if os.path.exists(config_file):
7063                with open(config_file) as config_file_content:
7064                    config_file_dict = yaml.safe_load(config_file_content)
7065                for config in config_file_dict:
7066                    configuration[config] = config_file_dict[config]
7067            else:
7068                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
7069                log.error(msg_error)
7070                raise ValueError(msg_error)
7071
7072        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or add, and the value is the new setting
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with those from the file
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
7074    def prioritization(
7075        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
7076    ) -> bool:
7077        """
7078        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
7079        prioritizes variants based on configured profiles and criteria.
7080
7081        :param table: The `table` parameter in the `prioritization` function is used to specify the name
7082        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
7083        a table name is provided, the method will prioritize the variants in that specific table
7084        :type table: str
7085        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
7086        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
7087        provided, the code will use a default prefix value of "PZ"
7088        :type pz_prefix: str
7089        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
7090        additional parameters specific to the prioritization process. These parameters can include
7091        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
7092        configurations needed for the prioritization of variants in a V
7093        :type pz_param: dict
7094        :return: A boolean value (True) is being returned from the `prioritization` function.
7095        """
7096
7097        # Config
7098        config = self.get_config()
7099
7100        # Param
7101        param = self.get_param()
7102
7103        # Prioritization param
7104        if pz_param is not None:
7105            prioritization_param = pz_param
7106        else:
7107            prioritization_param = param.get("prioritization", {})
7108
7109        # Configuration profiles
7110        prioritization_config_file = prioritization_param.get(
7111            "prioritization_config", None
7112        )
7113        prioritization_config_file = full_path(prioritization_config_file)
7114        prioritizations_config = self.get_config_json(
7115            name="prioritizations", config_file=prioritization_config_file
7116        )
7117
7118        # Prioritization prefix
7119        pz_prefix_default = "PZ"
7120        if pz_prefix is None:
7121            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
7122
7123        # Prioritization options
7124        profiles = prioritization_param.get("profiles", [])
7125        if isinstance(profiles, str):
7126            profiles = profiles.split(",")
7127        pzfields = prioritization_param.get(
7128            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
7129        )
7130        if isinstance(pzfields, str):
7131            pzfields = pzfields.split(",")
7132        default_profile = prioritization_param.get("default_profile", None)
7133        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
7134        prioritization_score_mode = prioritization_param.get(
7135            "prioritization_score_mode", "HOWARD"
7136        )
7137
7138        # Quick Prioritizations
7139        prioritizations = param.get("prioritizations", None)
7140        if prioritizations:
7141            log.info("Quick Prioritization:")
7142            for profile in prioritizations.split(","):
7143                if profile not in profiles:
7144                    profiles.append(profile)
7145                    log.info(f"   {profile}")
7146
7147        # If profile "ALL" provided, all profiles in the config profiles
7148        if "ALL" in profiles:
7149            profiles = list(prioritizations_config.keys())
7150
7151        for profile in profiles:
7152            if prioritizations_config.get(profile, None):
7153                log.debug(f"Profile '{profile}' configured")
7154            else:
7155                msg_error = f"Profile '{profile}' NOT configured"
7156                log.error(msg_error)
7157                raise ValueError(msg_error)
7158
7159        if profiles:
7160            log.info(f"Prioritization... ")
7161        else:
7162            log.debug(f"No profile defined")
7163            return False
7164
7165        if not default_profile and len(profiles):
7166            default_profile = profiles[0]
7167
7168        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
7169        log.debug("Profiles to check: " + str(list(profiles)))
7170
7171        # Variables
7172        if table is not None:
7173            table_variants = table
7174        else:
7175            table_variants = self.get_table_variants(clause="update")
7176        log.debug(f"Table to prioritize: {table_variants}")
7177
7178        # Added columns
7179        added_columns = []
7180
7181        # Create list of PZfields
7182        # List of PZFields
7183        list_of_pzfields_original = pzfields + [
7184            pzfield + pzfields_sep + profile
7185            for pzfield in pzfields
7186            for profile in profiles
7187        ]
7188        list_of_pzfields = []
7189        log.debug(f"{list_of_pzfields_original}")
7190
7191        # Remove existing PZfields to use if exists
7192        for pzfield in list_of_pzfields_original:
7193            if self.get_header().infos.get(pzfield, None) is None:
7194                list_of_pzfields.append(pzfield)
7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
7196            else:
7197                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
7198
7199        if list_of_pzfields:
7200
7201            # Explode Infos prefix
7202            explode_infos_prefix = self.get_explode_infos_prefix()
7203
7204            # PZfields tags description
7205            PZfields_INFOS = {
7206                f"{pz_prefix}Tags": {
7207                    "ID": f"{pz_prefix}Tags",
7208                    "Number": ".",
7209                    "Type": "String",
7210                    "Description": "Variant tags based on annotation criteria",
7211                },
7212                f"{pz_prefix}Score": {
7213                    "ID": f"{pz_prefix}Score",
7214                    "Number": 1,
7215                    "Type": "Integer",
7216                    "Description": "Variant score based on annotation criteria",
7217                },
7218                f"{pz_prefix}Flag": {
7219                    "ID": f"{pz_prefix}Flag",
7220                    "Number": 1,
7221                    "Type": "String",
7222                    "Description": "Variant flag based on annotation criteria",
7223                },
7224                f"{pz_prefix}Comment": {
7225                    "ID": f"{pz_prefix}Comment",
7226                    "Number": ".",
7227                    "Type": "String",
7228                    "Description": "Variant comment based on annotation criteria",
7229                },
7230                f"{pz_prefix}Infos": {
7231                    "ID": f"{pz_prefix}Infos",
7232                    "Number": ".",
7233                    "Type": "String",
7234                    "Description": "Variant infos based on annotation criteria",
7235                },
7236                f"{pz_prefix}Class": {
7237                    "ID": f"{pz_prefix}Class",
7238                    "Number": ".",
7239                    "Type": "String",
7240                    "Description": "Variant class based on annotation criteria",
7241                },
7242            }
7243
7244            # Create INFO fields if not exist
7245            for field in PZfields_INFOS:
7246                field_ID = PZfields_INFOS[field]["ID"]
7247                field_description = PZfields_INFOS[field]["Description"]
7248                if field_ID not in self.get_header().infos and field_ID in pzfields:
7249                    field_description = (
7250                        PZfields_INFOS[field]["Description"]
7251                        + f", profile {default_profile}"
7252                    )
7253                    self.get_header().infos[field_ID] = vcf.parser._Info(
7254                        field_ID,
7255                        PZfields_INFOS[field]["Number"],
7256                        PZfields_INFOS[field]["Type"],
7257                        field_description,
7258                        "unknown",
7259                        "unknown",
7260                        code_type_map[PZfields_INFOS[field]["Type"]],
7261                    )
7262
7263            # Create INFO fields if not exist for each profile
7264            for profile in prioritizations_config:
7265                if profile in profiles or profiles == []:
7266                    for field in PZfields_INFOS:
7267                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
7268                        field_description = (
7269                            PZfields_INFOS[field]["Description"]
7270                            + f", profile {profile}"
7271                        )
7272                        if (
7273                            field_ID not in self.get_header().infos
7274                            and field in pzfields
7275                        ):
7276                            self.get_header().infos[field_ID] = vcf.parser._Info(
7277                                field_ID,
7278                                PZfields_INFOS[field]["Number"],
7279                                PZfields_INFOS[field]["Type"],
7280                                field_description,
7281                                "unknown",
7282                                "unknown",
7283                                code_type_map[PZfields_INFOS[field]["Type"]],
7284                            )
7285
7286            # Header
7287            for pzfield in list_of_pzfields:
7288                if re.match(f"{pz_prefix}Score.*", pzfield):
7289                    added_column = self.add_column(
7290                        table_name=table_variants,
7291                        column_name=pzfield,
7292                        column_type="INTEGER",
7293                        default_value="0",
7294                    )
7295                elif re.match(f"{pz_prefix}Flag.*", pzfield):
7296                    added_column = self.add_column(
7297                        table_name=table_variants,
7298                        column_name=pzfield,
7299                        column_type="BOOLEAN",
7300                        default_value="1",
7301                    )
7302                elif re.match(f"{pz_prefix}Class.*", pzfield):
7303                    added_column = self.add_column(
7304                        table_name=table_variants,
7305                        column_name=pzfield,
7306                        column_type="VARCHAR[]",
7307                        default_value="null",
7308                    )
7309                else:
7310                    added_column = self.add_column(
7311                        table_name=table_variants,
7312                        column_name=pzfield,
7313                        column_type="STRING",
7314                        default_value="''",
7315                    )
7316                added_columns.append(added_column)
7317
7318            # Profiles
7319            if profiles:
7320
7321                # foreach profile in configuration file
7322                for profile in prioritizations_config:
7323
7324                    # If profile is asked in param, or ALL are asked (empty profile [])
7325                    if profile in profiles or profiles == []:
7326                        log.info(f"Profile '{profile}'")
7327
7328                        sql_set_info_option = ""
7329
7330                        sql_set_info = []
7331
7332                        # PZ fields set
7333
7334                        # PZScore
7335                        if (
7336                            f"{pz_prefix}Score{pzfields_sep}{profile}"
7337                            in list_of_pzfields
7338                        ):
7339                            sql_set_info.append(
7340                                f"""
7341                                    concat(
7342                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
7343                                        {pz_prefix}Score{pzfields_sep}{profile}
7344                                    ) 
7345                                """
7346                            )
7347                            if (
7348                                profile == default_profile
7349                                and f"{pz_prefix}Score" in list_of_pzfields
7350                            ):
7351                                sql_set_info.append(
7352                                    f"""
7353                                        concat(
7354                                            '{pz_prefix}Score=',
7355                                            {pz_prefix}Score{pzfields_sep}{profile}
7356                                        )
7357                                    """
7358                                )
7359
7360                        # PZFlag
7361                        if (
7362                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7363                            in list_of_pzfields
7364                        ):
7365                            sql_set_info.append(
7366                                f"""
7367                                    concat(
7368                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7369                                        CASE 
7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7371                                            THEN 'PASS'
7372                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7373                                            THEN 'FILTERED'
7374                                        END
7375                                    ) 
7376                                """
7377                            )
7378                            if (
7379                                profile == default_profile
7380                                and f"{pz_prefix}Flag" in list_of_pzfields
7381                            ):
7382                                sql_set_info.append(
7383                                    f"""
7384                                        concat(
7385                                            '{pz_prefix}Flag=',
7386                                            CASE 
7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7388                                                THEN 'PASS'
7389                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7390                                                THEN 'FILTERED'
7391                                            END
7392                                        )
7393                                    """
7394                                )
7395
7396                        # PZClass
7397                        if (
7398                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7399                            in list_of_pzfields
7400                        ):
7401                            sql_set_info.append(
7402                                f"""
7403                                    concat(
7404                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7405                                        CASE
7406                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7407                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7408                                            ELSE '.'
7409                                        END 
7410                                    )
7411                                    
7412                                """
7413                            )
7414                            if (
7415                                profile == default_profile
7416                                and f"{pz_prefix}Class" in list_of_pzfields
7417                            ):
7418                                sql_set_info.append(
7419                                    f"""
7420                                        concat(
7421                                            '{pz_prefix}Class=',
7422                                            CASE
7423                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7424                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7425                                                ELSE '.'
7426                                            END 
7427                                        )
7428                                    """
7429                                )
7430
7431                        # PZComment
7432                        if (
7433                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7434                            in list_of_pzfields
7435                        ):
7436                            sql_set_info.append(
7437                                f"""
7438                                    CASE
7439                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7440                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7441                                        ELSE ''
7442                                    END
7443                                """
7444                            )
7445                            if (
7446                                profile == default_profile
7447                                and f"{pz_prefix}Comment" in list_of_pzfields
7448                            ):
7449                                sql_set_info.append(
7450                                    f"""
7451                                        CASE
7452                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7453                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7454                                            ELSE ''
7455                                        END
7456                                    """
7457                                )
7458
7459                        # PZInfos
7460                        if (
7461                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7462                            in list_of_pzfields
7463                        ):
7464                            sql_set_info.append(
7465                                f"""
7466                                    CASE
7467                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7468                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7469                                        ELSE ''
7470                                    END
7471                                """
7472                            )
7473                            if (
7474                                profile == default_profile
7475                                and f"{pz_prefix}Infos" in list_of_pzfields
7476                            ):
7477                                sql_set_info.append(
7478                                    f"""
7479                                        CASE
7480                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7481                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7482                                            ELSE ''
7483                                        END
7484                                    """
7485                                )
7486
7487                        # Merge PZfields
7488                        sql_set_info_option = ""
7489                        sql_set_sep = ""
7490                        for sql_set in sql_set_info:
7491                            if sql_set_sep:
7492                                sql_set_info_option += f"""
7493                                    , concat('{sql_set_sep}', {sql_set})
7494                                """
7495                            else:
7496                                sql_set_info_option += f"""
7497                                    , {sql_set}
7498                                """
7499                            sql_set_sep = ";"
7500
7501                        sql_queries = []
7502                        for annotation in prioritizations_config[profile]:
7503
7504                            # skip special sections
7505                            if annotation.startswith("_"):
7506                                continue
7507
7508                            # For each criterions
7509                            for criterion in prioritizations_config[profile][
7510                                annotation
7511                            ]:
7512
7513                                # Criterion mode
7514                                criterion_mode = None
7515                                if np.any(
7516                                    np.isin(list(criterion.keys()), ["type", "value"])
7517                                ):
7518                                    criterion_mode = "operation"
7519                                elif np.any(
7520                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7521                                ):
7522                                    criterion_mode = "sql"
7523                                log.debug(f"Criterion Mode: {criterion_mode}")
7524
7525                                # Criterion parameters
7526                                criterion_type = criterion.get("type", None)
7527                                criterion_value = criterion.get("value", None)
7528                                criterion_sql = criterion.get("sql", None)
7529                                criterion_fields = criterion.get("fields", None)
7530                                criterion_score = criterion.get("score", 0)
7531                                criterion_flag = criterion.get("flag", "PASS")
7532                                criterion_class = criterion.get("class", None)
7533                                criterion_flag_bool = criterion_flag == "PASS"
7534                                criterion_comment = (
7535                                    ", ".join(criterion.get("comment", []))
7536                                    .replace("'", "''")
7537                                    .replace(";", ",")
7538                                    .replace("\t", " ")
7539                                )
7540                                criterion_infos = (
7541                                    str(criterion)
7542                                    .replace("'", "''")
7543                                    .replace(";", ",")
7544                                    .replace("\t", " ")
7545                                )
7546
7547                                # SQL
7548                                if criterion_sql is not None and isinstance(
7549                                    criterion_sql, list
7550                                ):
7551                                    criterion_sql = " ".join(criterion_sql)
7552
7553                                # Fields and explode
7554                                if criterion_fields is None:
7555                                    criterion_fields = [annotation]
7556                                if not isinstance(criterion_fields, list):
7557                                    criterion_fields = str(criterion_fields).split(",")
7558
7559                                # Class
7560                                if criterion_class is not None and not isinstance(
7561                                    criterion_class, list
7562                                ):
7563                                    criterion_class = str(criterion_class).split(",")
7564
7565                                for annotation_field in criterion_fields:
7566
7567                                    # Explode specific annotation
7568                                    log.debug(
7569                                        f"Explode annotation '{annotation_field}'"
7570                                    )
7571                                    added_columns += self.explode_infos(
7572                                        prefix=explode_infos_prefix,
7573                                        fields=[annotation_field],
7574                                        table=table_variants,
7575                                    )
7576                                    extra_infos = self.get_extra_infos(
7577                                        table=table_variants
7578                                    )
7579
7580                                    # Check if annotation field is present
7581                                    if (
7582                                        f"{explode_infos_prefix}{annotation_field}"
7583                                        not in extra_infos
7584                                    ):
7585                                        msq_err = f"Annotation '{annotation_field}' not in data"
7586                                        log.error(msq_err)
7587                                        raise ValueError(msq_err)
7588                                    else:
7589                                        log.debug(
7590                                            f"Annotation '{annotation_field}' in data"
7591                                        )
7592
7593                                sql_set = []
7594                                sql_set_info = []
7595
7596                                # PZ fields set
7597
7598                                # PZScore
7599                                if (
7600                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7601                                    in list_of_pzfields
7602                                ):
7603                                    # VaRank prioritization score mode
7604                                    if prioritization_score_mode.upper().strip() in [
7605                                        "VARANK",
7606                                        "MAX",
7607                                        "MAXIMUM",
7608                                        "TOP",
7609                                    ]:
7610                                        sql_set.append(
7611                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
7612                                        )
7613                                    # default HOWARD prioritization score mode
7614                                    else:
7615                                        sql_set.append(
7616                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7617                                        )
7618
7619                                # PZFlag
7620                                if (
7621                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7622                                    in list_of_pzfields
7623                                ):
7624                                    sql_set.append(
7625                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7626                                    )
7627
7628                                # PZClass
7629                                if (
7630                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7631                                    in list_of_pzfields
7632                                    and criterion_class is not None
7633                                ):
7634                                    sql_set.append(
7635                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7636                                    )
7637
7638                                # PZComment
7639                                if (
7640                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7641                                    in list_of_pzfields
7642                                ):
7643                                    sql_set.append(
7644                                        f"""
7645                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7646                                                concat(
7647                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7648                                                    CASE 
7649                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7650                                                        THEN ', '
7651                                                        ELSE ''
7652                                                    END,
7653                                                    '{criterion_comment}'
7654                                                )
7655                                        """
7656                                    )
7657
7658                                # PZInfos
7659                                if (
7660                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7661                                    in list_of_pzfields
7662                                ):
7663                                    sql_set.append(
7664                                        f"""
7665                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7666                                                concat(
7667                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7668                                                    '{criterion_infos}'
7669                                                )
7670                                        """
7671                                    )
7672                                sql_set_option = ",".join(sql_set)
7673
7674                                # Criterion and comparison
7675                                if sql_set_option:
7676
7677                                    if criterion_mode in ["operation"]:
7678
7679                                        try:
7680                                            float(criterion_value)
7681                                            sql_update = f"""
7682                                                UPDATE {table_variants}
7683                                                SET {sql_set_option}
7684                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7685                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7686                                            """
7687                                        except:
7688                                            contains_option = ""
7689                                            if criterion_type == "contains":
7690                                                contains_option = ".*"
7691                                            sql_update = f"""
7692                                                UPDATE {table_variants}
7693                                                SET {sql_set_option}
7694                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7695                                            """
7696                                        sql_queries.append(sql_update)
7697
7698                                    elif criterion_mode in ["sql"]:
7699
7700                                        sql_update = f"""
7701                                            UPDATE {table_variants}
7702                                            SET {sql_set_option}
7703                                            WHERE {criterion_sql}
7704                                        """
7705                                        sql_queries.append(sql_update)
7706
7707                                    else:
7708                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7709                                        log.error(msg_err)
7710                                        raise ValueError(msg_err)
7711
7712                                else:
7713                                    log.warning(
7714                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7715                                    )
7716
7717                        # PZTags
7718                        if (
7719                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7720                            in list_of_pzfields
7721                        ):
7722
7723                            # Create PZFalgs value
7724                            pztags_value = ""
7725                            pztags_sep_default = ","
7726                            pztags_sep = ""
7727                            for pzfield in pzfields:
7728                                if pzfield not in [f"{pz_prefix}Tags"]:
7729                                    if (
7730                                        f"{pzfield}{pzfields_sep}{profile}"
7731                                        in list_of_pzfields
7732                                    ):
7733                                        if pzfield in [f"{pz_prefix}Flag"]:
7734                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7735                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7736                                                    THEN 'PASS'
7737                                                    ELSE 'FILTERED'
7738                                                END, '"""
7739                                        elif pzfield in [f"{pz_prefix}Class"]:
7740                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7741                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7742                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7743                                                    ELSE '.'
7744                                                END, '"""
7745                                        else:
7746                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7747                                        pztags_sep = pztags_sep_default
7748
7749                            # Add Query update for PZFlags
7750                            sql_update_pztags = f"""
7751                                UPDATE {table_variants}
7752                                SET INFO = concat(
7753                                        INFO,
7754                                        CASE WHEN INFO NOT in ('','.')
7755                                                THEN ';'
7756                                                ELSE ''
7757                                        END,
7758                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7759                                    )
7760                                """
7761                            sql_queries.append(sql_update_pztags)
7762
7763                            # Add Query update for PZFlags for default
7764                            if profile == default_profile:
7765                                sql_update_pztags_default = f"""
7766                                UPDATE {table_variants}
7767                                SET INFO = concat(
7768                                        INFO,
7769                                        ';',
7770                                        '{pz_prefix}Tags={pztags_value}'
7771                                    )
7772                                """
7773                                sql_queries.append(sql_update_pztags_default)
7774
7775                        log.info(f"""Profile '{profile}' - Prioritization... """)
7776
7777                        if sql_queries:
7778
7779                            for sql_query in sql_queries:
7780                                log.debug(
7781                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7782                                )
7783                                self.conn.execute(sql_query)
7784
7785                        log.info(f"""Profile '{profile}' - Update... """)
7786                        sql_query_update = f"""
7787                            UPDATE {table_variants}
7788                            SET INFO =  
7789                                concat(
7790                                    CASE
7791                                        WHEN INFO NOT IN ('','.')
7792                                        THEN concat(INFO, ';')
7793                                        ELSE ''
7794                                    END
7795                                    {sql_set_info_option}
7796                                )
7797                        """
7798                        self.conn.execute(sql_query_update)
7799
7800        else:
7801
7802            log.warning(f"No profiles in parameters")
7803
7804        # Remove added columns
7805        for added_column in added_columns:
7806            self.drop_column(column=added_column)
7807
7808        # Explode INFOS fields into table fields
7809        if self.get_explode_infos():
7810            self.explode_infos(
7811                prefix=self.get_explode_infos_prefix(),
7812                fields=self.get_explode_infos_fields(),
7813                force=True,
7814            )
7815
7816        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function specifies the name of the database table (holding the variants loaded from the input VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

The prioritization function returns the boolean value True once processing has completed.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        SNV/InDel variants are joined with refSeq transcripts (and, when available, refSeqLink
        proteins), HGVS names are computed in parallel with Dask, written to a temporary column,
        and finally appended to the INFO field as 'hgvs=...'.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: pandas DataFrame containing the variants of one Dask partition
            :return: the result of applying `annotation_hgvs_partition` to each row of
            the partition dataframe along axis 1
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object that contains values for the keys
            "CHROM", "POS", "REF" and "ALT"
            :return: a comma-separated string of HGVS names for the given row
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): refseqlink_df is only registered when a refSeqLink file
                # was found; these protein-related options presumably require it — confirm
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refSeq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "hgvs_options" (comma-separated var=val pairs) into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; nothing to do otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # Random suffix to avoid clashing with an existing column name
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection
        # NOTE(review): polars_conn was already created above; this reassignment
        # looks redundant — confirm before removing
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' for each annotated variant
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): typo "annotatation" kept as-is — it is a runtime string
        # written to the VCF header Description; fix separately if desired
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
8209    def get_operations_help(
8210        self, operations_config_dict: dict = {}, operations_config_file: str = None
8211    ) -> list:
8212
8213        # Init
8214        operations_help = []
8215
8216        # operations
8217        operations = self.get_config_json(
8218            name="calculations",
8219            config_dict=operations_config_dict,
8220            config_file=operations_config_file,
8221        )
8222        for op in operations:
8223            op_name = operations[op].get("name", op).upper()
8224            op_description = operations[op].get("description", op_name)
8225            op_available = operations[op].get("available", False)
8226            if op_available:
8227                operations_help.append(f"   {op_name}: {op_description}")
8228
8229        # Sort operations
8230        operations_help.sort()
8231
8232        # insert header
8233        operations_help.insert(0, "Available calculation operations:")
8234
8235        # Return
8236        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        For each requested operation, its definition is looked up in the calculations
        configuration and dispatched to the "python" or "sql" processing method.

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    }
                },
                "middle": null
            }

        :param operations: dictionary of operations to perform (may be overridden by
            param["calculation"]["calculations"] or the quick param["calculations"] string)
        :param operations_config_dict: optional configuration dictionary for the
            "calculations" configuration
        :param operations_config_file: optional path to a configuration file for the
            "calculations" configuration (defaults to param["calculation"]["calculation_config"])
        :raises ValueError: if an operation or its type is not available in the configuration
        """

        # Param
        param = self.get_param()

        # Check operations config file (fall back to the one declared in param)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys — operation lookup is case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add operations from the comma-separated param["calculations"]
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (after the quick ones, keeping order)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (last fallback to param)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operation: dispatch on its configured type ("sql" by default)
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
8366    def calculation_process_sql(
8367        self, operation: dict, operation_name: str = "unknown"
8368    ) -> None:
8369        """
8370        The `calculation_process_sql` function takes in a mathematical operation as a string and
8371        performs the operation, updating the specified table with the result.
8372
8373        :param operation: The `operation` parameter is a dictionary that contains information about the
8374        mathematical operation to be performed. It includes the following keys:
8375        :type operation: dict
8376        :param operation_name: The `operation_name` parameter is a string that represents the name of
8377        the mathematical operation being performed. It is used for logging and error handling purposes,
8378        defaults to unknown
8379        :type operation_name: str (optional)
8380        """
8381
8382        # Operation infos
8383        operation_name = operation.get("name", "unknown")
8384        log.debug(f"process SQL {operation_name}")
8385        output_column_name = operation.get("output_column_name", operation_name)
8386        output_column_type = operation.get("output_column_type", "String")
8387        prefix = operation.get("explode_infos_prefix", "")
8388        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8389        output_column_description = operation.get(
8390            "output_column_description", f"{operation_name} operation"
8391        )
8392        operation_query = operation.get("operation_query", None)
8393        if isinstance(operation_query, list):
8394            operation_query = " ".join(operation_query)
8395        operation_info_fields = operation.get("info_fields", [])
8396        operation_info_fields_check = operation.get("info_fields_check", False)
8397        operation_info = operation.get("operation_info", True)
8398        operation_table = operation.get(
8399            "table", self.get_table_variants(clause="alter")
8400        )
8401
8402        # table variants
8403        if operation_table:
8404            table_variants = operation_table
8405        else:
8406            table_variants = self.get_table_variants(clause="alter")
8407
8408        if operation_query:
8409
8410            # Info fields check
8411            operation_info_fields_check_result = True
8412            if operation_info_fields_check:
8413                header_infos = self.get_header().infos
8414                for info_field in operation_info_fields:
8415                    operation_info_fields_check_result = (
8416                        operation_info_fields_check_result
8417                        and info_field in header_infos
8418                    )
8419
8420            # If info fields available
8421            if operation_info_fields_check_result:
8422
8423                # Added_columns
8424                added_columns = []
8425
8426                # Create VCF header field
8427                vcf_reader = self.get_header()
8428                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8429                    output_column_name,
8430                    ".",
8431                    output_column_type,
8432                    output_column_description,
8433                    "howard calculation",
8434                    "0",
8435                    self.code_type_map.get(output_column_type),
8436                )
8437
8438                # Explode infos if needed
8439                log.debug(f"calculation_process_sql prefix {prefix}")
8440                added_columns += self.explode_infos(
8441                    prefix=prefix,
8442                    fields=[output_column_name] + operation_info_fields,
8443                    force=False,
8444                    table=table_variants,
8445                )
8446
8447                # Create column
8448                added_column = self.add_column(
8449                    table_name=table_variants,
8450                    column_name=prefix + output_column_name,
8451                    column_type=output_column_type_sql,
8452                    default_value="null",
8453                )
8454                added_columns.append(added_column)
8455
8456                # Operation calculation
8457                try:
8458
8459                    # Query to update calculation column
8460                    sql_update = f"""
8461                        UPDATE {table_variants}
8462                        SET "{prefix}{output_column_name}" = ({operation_query})
8463                    """
8464                    self.conn.execute(sql_update)
8465
8466                    # Add to INFO
8467                    if operation_info:
8468                        sql_update_info = f"""
8469                            UPDATE {table_variants}
8470                            SET "INFO" =
8471                                concat(
8472                                    CASE
8473                                        WHEN "INFO" IS NOT NULL
8474                                        THEN concat("INFO", ';')
8475                                        ELSE ''
8476                                    END,
8477                                    '{output_column_name}=',
8478                                    "{prefix}{output_column_name}"
8479                                )
8480                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8481                        """
8482                        self.conn.execute(sql_update_info)
8483
8484                except:
8485                    log.error(
8486                        f"Operations config: Calculation '{operation_name}' query failed"
8487                    )
8488                    raise ValueError(
8489                        f"Operations config: Calculation '{operation_name}' query failed"
8490                    )
8491
8492                # Remove added columns
8493                for added_column in added_columns:
8494                    log.debug(f"added_column: {added_column}")
8495                    self.drop_column(column=added_column)
8496
8497            else:
8498                log.error(
8499                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8500                )
8501                raise ValueError(
8502                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8503                )
8504
8505        else:
8506            log.error(
8507                f"Operations config: Calculation '{operation_name}' query NOT defined"
8508            )
8509            raise ValueError(
8510                f"Operations config: Calculation '{operation_name}' query NOT defined"
8511            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8513    def calculation_process_function(
8514        self, operation: dict, operation_name: str = "unknown"
8515    ) -> None:
8516        """
8517        The `calculation_process_function` takes in an operation dictionary and performs the specified
8518        function with the given parameters.
8519
8520        :param operation: The `operation` parameter is a dictionary that contains information about the
8521        operation to be performed. It has the following keys:
8522        :type operation: dict
8523        :param operation_name: The `operation_name` parameter is a string that represents the name of
8524        the operation being performed. It is used for logging purposes, defaults to unknown
8525        :type operation_name: str (optional)
8526        """
8527
8528        operation_name = operation["name"]
8529        log.debug(f"process Python {operation_name}")
8530        function_name = operation["function_name"]
8531        function_params = operation["function_params"]
8532        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
def calculation_variant_id(self) -> None:
    """
    Declare a variant ID annotation in the VCF header and append the variant ID
    to the INFO field of every record in the variants table.
    """

    # Annotation tag holding the variant ID; the helper column is dropped at the end
    id_tag = self.get_variant_id_column()
    columns_to_clean = [id_tag]

    # Description used for the VCF header entry
    id_tag_descriptions = {
        id_tag: "howard variant ID annotation",
    }

    # Working table and header
    variants_table = self.get_table_variants()
    header = self.get_header()

    # Declare the variant ID tag in the VCF header
    header.infos[id_tag] = vcf.parser._Info(
        id_tag,
        ".",
        "String",
        id_tag_descriptions.get(id_tag, "howard variant ID annotation"),
        "howard calculation",
        "0",
        self.code_type_map.get("String"),
    )

    # Append '<tag>=<value>' to INFO, inserting a ';' separator only when INFO
    # already holds a value
    update_query = f"""
        UPDATE {variants_table}
        SET "INFO" = 
            concat(
                CASE
                    WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                    THEN ''
                    ELSE concat("INFO", ';')
                END,
                '{id_tag}=',
                "{id_tag}"
            )
    """
    self.conn.execute(update_query)

    # Drop the helper columns added for this calculation
    for column_to_clean in columns_to_clean:
        self.drop_column(column=column_to_clean)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
def calculation_extract_snpeff_hgvs(
    self,
    snpeff_hgvs: str = "snpeff_hgvs",
    snpeff_field: str = "ANN",
) -> None:
    """
    Extract HGVS nomenclatures from the snpEff annotation field of a VCF and
    append them to the INFO field of the variants table.

    :param snpeff_hgvs: Name of the INFO tag that will store the HGVS
        nomenclatures extracted from the snpEff annotation field, defaults to
        snpeff_hgvs
    :type snpeff_hgvs: str (optional)
    :param snpeff_field: Name of the INFO field that contains the snpEff
        annotations to parse, defaults to ANN
    :type snpeff_field: str (optional)
    """

    # Description used for the VCF header entry
    vcf_infos_tags = {
        snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
    }

    # Prefix of exploded INFO columns
    # NOTE(review): any truthy prefix is replaced by "INFO/" — presumably exploded
    # columns are always named "INFO/<field>"; confirm intent
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # Exploded column names for the snpEff field and for the new HGVS field
    speff_ann_infos = prefix + snpeff_field
    speff_hgvs_infos = prefix + snpeff_hgvs

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added by this calculation (removed at the end)
    added_columns = []

    # Explode the snpEff field into a column
    added_columns += self.explode_infos(fields=[snpeff_field])

    if snpeff_field in vcf_reader.infos:

        log.debug(vcf_reader.infos[snpeff_field])

        # Extract the annotation sub-field names from the header description:
        # the part quoted between single quotes, separated by " | "
        ann_description = vcf_reader.infos[snpeff_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            ann_header_desc = {}
            for i in range(len(ann_header_match)):
                # Normalized key: alphanumeric characters only
                ann_header_info = "".join(
                    char for char in ann_header_match[i] if char.isalnum()
                )
                ann_header_desc[ann_header_info] = ann_header_match[i]
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id column (helper, dropped at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Load variant ids and snpEff annotations into a dataframe
        dataframe_snpeff_hgvs = self.get_query_to_df(
            f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
        )

        # Extract the HGVS nomenclatures for each variant
        dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
            speff_ann_infos
        ].apply(
            lambda x: extract_snpeff_hgvs(
                str(x), header=list(ann_header_desc.values())
            )
        )

        # Add snpeff_hgvs to header
        vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
            snpeff_hgvs,
            ".",
            "String",
            vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO for variants with a non-empty value.
        # Fix: use the configured variants table consistently (the WHERE clause
        # already did), instead of a hard-coded 'variants' table name
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" = 
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE 
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_snpeff_hgvs
        gc.collect()

    else:

        log.warning(
            "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file, defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a new column, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
def calculation_snpeff_ann_explode(
    self,
    uniquify: bool = True,
    output_format: str = "fields",
    output_prefix: str = "snpeff_",
    snpeff_field: str = "ANN",
) -> None:
    """
    Explode snpEff annotations from a VCF annotation field and append them to
    the INFO field of the variants table.

    :param uniquify: Whether duplicate annotation values should be removed from
        the exploded output, defaults to True
    :type uniquify: bool (optional)
    :param output_format: Format of the generated annotations: "fields" emits
        one INFO tag per annotation sub-field, "JSON" emits a single JSON-valued
        tag, defaults to fields
    :type output_format: str (optional)
    :param output_prefix: Prefix added to the generated annotation tags, to
        distinguish them from existing ones, defaults to snpeff_
    :type output_prefix: str (optional)
    :param snpeff_field: Name of the INFO field that contains the snpEff
        annotations to explode, defaults to ANN
    :type snpeff_field: str (optional)
    """

    # Internal name of the exploded-annotations column
    snpeff_hgvs = "snpeff_ann_explode"

    # Description used for the VCF header entries
    vcf_infos_tags = {
        snpeff_hgvs: "Explode snpEff annotations",
    }

    # Prefix of exploded INFO columns
    # NOTE(review): any truthy prefix is replaced by "INFO/" — presumably exploded
    # columns are always named "INFO/<field>"; confirm intent
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # Exploded column names for the snpEff field and for the new column
    speff_ann_infos = prefix + snpeff_field
    speff_hgvs_infos = prefix + snpeff_hgvs

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added by this calculation (removed at the end)
    added_columns = []

    # Explode the snpEff field into a column
    added_columns += self.explode_infos(fields=[snpeff_field])
    log.debug(f"snpeff_field={snpeff_field}")
    log.debug(f"added_columns={added_columns}")

    if snpeff_field in vcf_reader.infos:

        # Extract the annotation sub-field names from the header description:
        # the part quoted between single quotes, separated by " | "
        ann_description = vcf_reader.infos[snpeff_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            ann_header = []
            ann_header_desc = {}
            for i in range(len(ann_header_match)):
                # Normalized key: alphanumeric characters only
                ann_header_info = "".join(
                    char for char in ann_header_match[i] if char.isalnum()
                )
                ann_header.append(ann_header_info)
                ann_header_desc[ann_header_info] = ann_header_match[i]
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id column (helper, dropped at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Load variant ids and snpEff annotations into a dataframe
        dataframe_snpeff_hgvs = self.get_query_to_df(
            f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
        )

        # Explode the snpEff annotations for each variant
        dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
            speff_ann_infos
        ].apply(
            lambda x: explode_snpeff_ann(
                str(x),
                uniquify=uniquify,
                output_format=output_format,
                prefix=output_prefix,
                header=list(ann_header_desc.values()),
            )
        )

        # Declare the generated tags in the header: a single JSON tag, or one
        # tag per annotation sub-field
        ann_annotations_prefix = ""
        if output_format.upper() in ["JSON"]:
            ann_annotations_prefix = f"{output_prefix}="
            vcf_reader.infos[output_prefix] = vcf.parser._Info(
                output_prefix,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                + " - JSON format",
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )
        else:
            for ann_annotation in ann_header:
                ann_annotation_id = f"{output_prefix}{ann_annotation}"
                vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                    ann_annotation_id,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + f" - '{ann_header_desc[ann_annotation]}' annotation",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

        # Append the exploded annotations to INFO for variants with a non-empty
        # value. Fix: use the configured variants table consistently (the WHERE
        # clause already did), instead of a hard-coded 'variants' table name
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" = 
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE 
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{ann_annotations_prefix}',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_snpeff_hgvs
        gc.collect()

    else:

        log.warning(
            "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data, defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
def calculation_extract_nomen(self) -> None:
    """
    Extract HGVS nomenclature (NOMEN and related fields) from the configured
    HGVS annotation field and append them to the INFO field of the variants
    table.

    Options are read from param["calculation"]["calculations"]["NOMEN"]["options"]:
    hgvs_field, pattern, transcripts (file), transcripts_table,
    transcripts_column, transcripts_order.
    """

    # INFO key used to hold the intermediate NOMEN structure
    field_nomen_dict = "NOMEN_DICT"

    # NOMEN structure: tag -> VCF header description
    nomen_dict = {
        "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
        "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
        "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
        "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
        "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
        "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
        "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
        "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
        "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
        "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
    }

    # Param
    param = self.get_param()

    # Prefix of exploded INFO columns
    prefix = self.get_explode_infos_prefix()

    # Header
    vcf_reader = self.get_header()

    # Variants table (fix: sibling calculations use get_table_variants()
    # instead of a hard-coded 'variants' table name)
    table_variants = self.get_table_variants()

    # Columns added by this calculation (removed at the end)
    added_columns = []

    # Get HGVS field
    hgvs_field = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("hgvs_field", "hgvs")
    )

    # Get NOMEN pattern
    nomen_pattern = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("pattern", None)
    )

    # Transcripts of preference, by source ("file", "column")
    transcripts_sources = {}

    # Transcripts of preference from a file (first column)
    transcripts_file = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("transcripts", None)
    )
    transcripts_file = full_path(transcripts_file)
    if transcripts_file:
        if os.path.exists(transcripts_file):
            transcripts_dataframe = transcripts_file_to_df(transcripts_file)
            transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
            transcripts_sources["file"] = transcripts_from_file
        else:
            msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
            log.error(msg_err)
            raise ValueError(msg_err)

    # Transcripts of preference from a table column
    transcripts_table = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("transcripts_table", table_variants)
    )
    transcripts_column = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("transcripts_column", None)
    )

    if transcripts_table and transcripts_column:
        extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
        # Explode if not exists
        added_columns += self.explode_infos(
            fields=[transcripts_column], table=transcripts_table
        )
    else:
        # No transcript column configured: select NULL as the transcript
        extra_field_transcript = "NULL"

    # Transcripts of preference source order
    transcripts_order = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("transcripts_order", ["column", "file"])
    )

    # Transcripts from file
    transcripts = transcripts_sources.get("file", [])

    # Explode HGVS field in column
    added_columns += self.explode_infos(fields=[hgvs_field])

    # extra infos
    extra_infos = self.get_extra_infos()
    extra_field = prefix + hgvs_field

    if extra_field in extra_infos:

        # Load variant keys, HGVS values and transcript of preference
        dataframe_hgvs = self.get_query_to_df(
            f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM {table_variants} """
        )

        # Transcripts rank: 1-based preference order from the file
        transcripts_rank = {
            transcript: rank for rank, transcript in enumerate(transcripts, start=1)
        }
        transcripts_len = len(transcripts_rank)

        # Compute the NOMEN structure for each variant
        dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
            lambda x: find_nomen(
                hgvs=x.hgvs,
                transcript=x.transcript,
                transcripts=transcripts_rank,
                pattern=nomen_pattern,
                transcripts_source_order=transcripts_order,
                transcripts_len=transcripts_len,
            ),
            axis=1,
        )

        # Explode NOMEN structure: declare each tag in the header and build the
        # SQL expression that appends ';<tag>=<value>' when a value is present
        sql_nomen_fields = []
        for nomen_field in nomen_dict:

            # Create VCF header field
            vcf_reader.infos[nomen_field] = vcf.parser._Info(
                nomen_field,
                ".",
                "String",
                nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Add field to SQL query update
            sql_nomen_fields.append(
                f"""
                    CASE 
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                                ';{nomen_field}=',
                                dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                            )
                        ELSE ''
                    END
                """
            )

        # SQL set for update
        sql_nomen_fields_set = ", ".join(sql_nomen_fields)

        # Update
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" = 
                concat(
                    CASE
                        WHEN "INFO" IS NULL
                        THEN ''
                        ELSE "INFO"
                    END,
                    {sql_nomen_fields_set}
                )
            FROM dataframe_hgvs
            WHERE {table_variants}."#CHROM" = dataframe_hgvs."#CHROM"
                AND {table_variants}."POS" = dataframe_hgvs."POS" 
                AND {table_variants}."REF" = dataframe_hgvs."REF"
                AND {table_variants}."ALT" = dataframe_hgvs."ALT"
        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_hgvs
        gc.collect()

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
    """
    Compute, for each variant, the number of pipelines/samples in which it is
    found, and append the result to the INFO field of the variants table.

    Does nothing when the VCF has no FORMAT column or no samples.

    :param tag: Name of the INFO tag that will carry the findbypipeline
        annotation, both in the VCF header and in the variants table, defaults
        to findbypipeline
    :type tag: str (optional)
    """

    # Only applicable when the VCF has a FORMAT column and samples
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # findbypipeline annotation field
        findbypipeline_tag = tag

        # Description used for the VCF header entry
        vcf_infos_tags = {
            findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
        }

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Column holding the computed annotation
        findbypipeline_infos = prefix + findbypipeline_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id column (helper, dropped at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            [f""" "{sample}" """ for sample in self.get_header_sample_list()]
        )

        # Load variant ids, FORMAT and sample columns into a dataframe
        dataframe_findbypipeline = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Compute the findbypipeline value per variant
        dataframe_findbypipeline[findbypipeline_infos] = (
            dataframe_findbypipeline.apply(
                lambda row: findbypipeline(
                    row, samples=self.get_header_sample_list()
                ),
                axis=1,
            )
        )

        # Add findbypipeline tag to header
        vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
            findbypipeline_tag,
            ".",
            "String",
            vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO for variants with a non-empty value.
        # Fix: use the configured variants table consistently, instead of a
        # hard-coded 'variants' table name
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" = 
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE 
                        WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                        THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                        ELSE ''
                    END
                )
            FROM dataframe_findbypipeline
            WHERE {table_variants}."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_findbypipeline
        gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
9212    def calculation_genotype_concordance(self) -> None:
9213        """
9214        The function `calculation_genotype_concordance` calculates the genotype concordance for
9215        multi-caller VCF files and updates the variant information in the database.
9216        """
9217
9218        # if FORMAT and samples
9219        if (
9220            "FORMAT" in self.get_header_columns_as_list()
9221            and self.get_header_sample_list()
9222        ):
9223
9224            # genotypeconcordance annotation field
9225            genotypeconcordance_tag = "genotypeconcordance"
9226
9227            # VCF infos tags
9228            vcf_infos_tags = {
9229                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
9230            }
9231
9232            # Prefix
9233            prefix = self.get_explode_infos_prefix()
9234
9235            # Field
9236            genotypeconcordance_infos = prefix + genotypeconcordance_tag
9237
9238            # Variants table
9239            table_variants = self.get_table_variants()
9240
9241            # Header
9242            vcf_reader = self.get_header()
9243
9244            # Create variant id
9245            variant_id_column = self.get_variant_id_column()
9246            added_columns = [variant_id_column]
9247
9248            # variant_id, FORMAT and samples
9249            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9250                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9251            )
9252
9253            # Create dataframe
9254            dataframe_genotypeconcordance = self.get_query_to_df(
9255                f""" SELECT {samples_fields} FROM {table_variants} """
9256            )
9257
9258            # Create genotypeconcordance column
9259            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
9260                dataframe_genotypeconcordance.apply(
9261                    lambda row: genotypeconcordance(
9262                        row, samples=self.get_header_sample_list()
9263                    ),
9264                    axis=1,
9265                )
9266            )
9267
9268            # Add genotypeconcordance to header
9269            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
9270                genotypeconcordance_tag,
9271                ".",
9272                "String",
9273                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
9274                "howard calculation",
9275                "0",
9276                self.code_type_map.get("String"),
9277            )
9278
9279            # Update
9280            sql_update = f"""
9281                UPDATE variants
9282                SET "INFO" = 
9283                    concat(
9284                        CASE
9285                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9286                            THEN ''
9287                            ELSE concat("INFO", ';')
9288                        END,
9289                        CASE
9290                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
9291                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
9292                            THEN concat(
9293                                    '{genotypeconcordance_tag}=',
9294                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
9295                                )
9296                            ELSE ''
9297                        END
9298                    )
9299                FROM dataframe_genotypeconcordance
9300                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
9301            """
9302            self.conn.execute(sql_update)
9303
9304            # Remove added columns
9305            for added_column in added_columns:
9306                self.drop_column(column=added_column)
9307
9308            # Delete dataframe
9309            del dataframe_genotypeconcordance
9310            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
9312    def calculation_barcode(self, tag: str = "barcode") -> None:
9313        """
9314        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
9315        updates the INFO field in the file with the calculated barcode values.
9316
9317        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
9318        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
9319        the default tag name is set to "barcode", defaults to barcode
9320        :type tag: str (optional)
9321        """
9322
9323        # if FORMAT and samples
9324        if (
9325            "FORMAT" in self.get_header_columns_as_list()
9326            and self.get_header_sample_list()
9327        ):
9328
9329            # barcode annotation field
9330            if not tag:
9331                tag = "barcode"
9332
9333            # VCF infos tags
9334            vcf_infos_tags = {
9335                tag: "barcode calculation (VaRank)",
9336            }
9337
9338            # Prefix
9339            prefix = self.get_explode_infos_prefix()
9340
9341            # Field
9342            barcode_infos = prefix + tag
9343
9344            # Variants table
9345            table_variants = self.get_table_variants()
9346
9347            # Header
9348            vcf_reader = self.get_header()
9349
9350            # Create variant id
9351            variant_id_column = self.get_variant_id_column()
9352            added_columns = [variant_id_column]
9353
9354            # variant_id, FORMAT and samples
9355            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9356                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9357            )
9358
9359            # Create dataframe
9360            dataframe_barcode = self.get_query_to_df(
9361                f""" SELECT {samples_fields} FROM {table_variants} """
9362            )
9363
9364            # Create barcode column
9365            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9366                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
9367            )
9368
9369            # Add barcode to header
9370            vcf_reader.infos[tag] = vcf.parser._Info(
9371                tag,
9372                ".",
9373                "String",
9374                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
9375                "howard calculation",
9376                "0",
9377                self.code_type_map.get("String"),
9378            )
9379
9380            # Update
9381            sql_update = f"""
9382                UPDATE {table_variants}
9383                SET "INFO" = 
9384                    concat(
9385                        CASE
9386                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9387                            THEN ''
9388                            ELSE concat("INFO", ';')
9389                        END,
9390                        CASE
9391                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
9392                            AND dataframe_barcode."{barcode_infos}" NOT NULL
9393                            THEN concat(
9394                                    '{tag}=',
9395                                    dataframe_barcode."{barcode_infos}"
9396                                )
9397                            ELSE ''
9398                        END
9399                    )
9400                FROM dataframe_barcode
9401                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9402            """
9403            self.conn.execute(sql_update)
9404
9405            # Remove added columns
9406            for added_column in added_columns:
9407                self.drop_column(column=added_column)
9408
9409            # Delete dataframe
9410            del dataframe_barcode
9411            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
9413    def calculation_barcode_family(self, tag: str = "BCF") -> None:
9414        """
9415        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
9416        and updates the INFO field in the file with the calculated barcode values.
9417
9418        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
9419        the barcode tag that will be added to the VCF file during the calculation process. If no value
9420        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
9421        :type tag: str (optional)
9422        """
9423
9424        # if FORMAT and samples
9425        if (
9426            "FORMAT" in self.get_header_columns_as_list()
9427            and self.get_header_sample_list()
9428        ):
9429
9430            # barcode annotation field
9431            if not tag:
9432                tag = "BCF"
9433
9434            # VCF infos tags
9435            vcf_infos_tags = {
9436                tag: "barcode family calculation",
9437                f"{tag}S": "barcode family samples",
9438            }
9439
9440            # Param
9441            param = self.get_param()
9442            log.debug(f"param={param}")
9443
9444            # Prefix
9445            prefix = self.get_explode_infos_prefix()
9446
9447            # PED param
9448            ped = (
9449                param.get("calculation", {})
9450                .get("calculations", {})
9451                .get("BARCODEFAMILY", {})
9452                .get("family_pedigree", None)
9453            )
9454            log.debug(f"ped={ped}")
9455
9456            # Load PED
9457            if ped:
9458
9459                # Pedigree is a file
9460                if isinstance(ped, str) and os.path.exists(full_path(ped)):
9461                    log.debug("Pedigree is file")
9462                    with open(full_path(ped)) as ped:
9463                        ped = yaml.safe_load(ped)
9464
9465                # Pedigree is a string
9466                elif isinstance(ped, str):
9467                    log.debug("Pedigree is str")
9468                    try:
9469                        ped = json.loads(ped)
9470                        log.debug("Pedigree is json str")
9471                    except ValueError as e:
9472                        ped_samples = ped.split(",")
9473                        ped = {}
9474                        for ped_sample in ped_samples:
9475                            ped[ped_sample] = ped_sample
9476
9477                # Pedigree is a dict
9478                elif isinstance(ped, dict):
9479                    log.debug("Pedigree is dict")
9480
9481                # Pedigree is not well formatted
9482                else:
9483                    msg_error = "Pedigree not well formatted"
9484                    log.error(msg_error)
9485                    raise ValueError(msg_error)
9486
9487                # Construct list
9488                ped_samples = list(ped.values())
9489
9490            else:
9491                log.debug("Pedigree not defined. Take all samples")
9492                ped_samples = self.get_header_sample_list()
9493                ped = {}
9494                for ped_sample in ped_samples:
9495                    ped[ped_sample] = ped_sample
9496
9497            # Check pedigree
9498            if not ped or len(ped) == 0:
9499                msg_error = f"Error in pedigree: samples {ped_samples}"
9500                log.error(msg_error)
9501                raise ValueError(msg_error)
9502
9503            # Log
9504            log.info(
9505                "Calculation 'BARCODEFAMILY' - Samples: "
9506                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
9507            )
9508            log.debug(f"ped_samples={ped_samples}")
9509
9510            # Field
9511            barcode_infos = prefix + tag
9512
9513            # Variants table
9514            table_variants = self.get_table_variants()
9515
9516            # Header
9517            vcf_reader = self.get_header()
9518
9519            # Create variant id
9520            variant_id_column = self.get_variant_id_column()
9521            added_columns = [variant_id_column]
9522
9523            # variant_id, FORMAT and samples
9524            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9525                [f""" "{sample}" """ for sample in ped_samples]
9526            )
9527
9528            # Create dataframe
9529            dataframe_barcode = self.get_query_to_df(
9530                f""" SELECT {samples_fields} FROM {table_variants} """
9531            )
9532
9533            # Create barcode column
9534            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9535                lambda row: barcode(row, samples=ped_samples), axis=1
9536            )
9537
9538            # Add barcode family to header
9539            # Add vaf_normalization to header
9540            vcf_reader.formats[tag] = vcf.parser._Format(
9541                id=tag,
9542                num=".",
9543                type="String",
9544                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
9545                type_code=self.code_type_map.get("String"),
9546            )
9547            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
9548                id=f"{tag}S",
9549                num=".",
9550                type="String",
9551                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
9552                type_code=self.code_type_map.get("String"),
9553            )
9554
9555            # Update
9556            # for sample in ped_samples:
9557            sql_update_set = []
9558            for sample in self.get_header_sample_list() + ["FORMAT"]:
9559                if sample in ped_samples:
9560                    value = f'dataframe_barcode."{barcode_infos}"'
9561                    value_samples = (
9562                        "'"
9563                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
9564                        + "'"
9565                    )
9566                    ped_samples
9567                elif sample == "FORMAT":
9568                    value = f"'{tag}'"
9569                    value_samples = f"'{tag}S'"
9570                else:
9571                    value = "'.'"
9572                    value_samples = "'.'"
9573                format_regex = r"[a-zA-Z0-9\s]"
9574                sql_update_set.append(
9575                    f"""
9576                        "{sample}" = 
9577                        concat(
9578                            CASE
9579                                WHEN {table_variants}."{sample}" = './.'
9580                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
9581                                ELSE {table_variants}."{sample}"
9582                            END,
9583                            ':',
9584                            {value},
9585                            ':',
9586                            {value_samples}
9587                        )
9588                    """
9589                )
9590
9591            sql_update_set_join = ", ".join(sql_update_set)
9592            sql_update = f"""
9593                UPDATE {table_variants}
9594                SET {sql_update_set_join}
9595                FROM dataframe_barcode
9596                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9597            """
9598            self.conn.execute(sql_update)
9599
9600            # Remove added columns
9601            for added_column in added_columns:
9602                self.drop_column(column=added_column)
9603
9604            # Delete dataframe
9605            del dataframe_barcode
9606            gc.collect()

The calculation_barcode_family function calculates family barcode values for variants in a VCF file and writes the calculated values into the FORMAT and per-sample fields (not the INFO field).

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF"; defaults to BCF.
def calculation_trio(self) -> None:
9608    def calculation_trio(self) -> None:
9609        """
9610        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9611        information to the INFO field of each variant.
9612        """
9613
9614        # if FORMAT and samples
9615        if (
9616            "FORMAT" in self.get_header_columns_as_list()
9617            and self.get_header_sample_list()
9618        ):
9619
9620            # trio annotation field
9621            trio_tag = "trio"
9622
9623            # VCF infos tags
9624            vcf_infos_tags = {
9625                "trio": "trio calculation",
9626            }
9627
9628            # Param
9629            param = self.get_param()
9630
9631            # Prefix
9632            prefix = self.get_explode_infos_prefix()
9633
9634            # Trio param
9635            trio_ped = (
9636                param.get("calculation", {})
9637                .get("calculations", {})
9638                .get("TRIO", {})
9639                .get("trio_pedigree", None)
9640            )
9641
9642            # Load trio
9643            if trio_ped:
9644
9645                # Trio pedigree is a file
9646                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9647                    log.debug("TRIO pedigree is file")
9648                    with open(full_path(trio_ped)) as trio_ped:
9649                        trio_ped = yaml.safe_load(trio_ped)
9650
9651                # Trio pedigree is a string
9652                elif isinstance(trio_ped, str):
9653                    log.debug("TRIO pedigree is str")
9654                    try:
9655                        trio_ped = json.loads(trio_ped)
9656                        log.debug("TRIO pedigree is json str")
9657                    except ValueError as e:
9658                        trio_samples = trio_ped.split(",")
9659                        if len(trio_samples) == 3:
9660                            trio_ped = {
9661                                "father": trio_samples[0],
9662                                "mother": trio_samples[1],
9663                                "child": trio_samples[2],
9664                            }
9665                            log.debug("TRIO pedigree is list str")
9666                        else:
9667                            msg_error = "TRIO pedigree not well formatted"
9668                            log.error(msg_error)
9669                            raise ValueError(msg_error)
9670
9671                # Trio pedigree is a dict
9672                elif isinstance(trio_ped, dict):
9673                    log.debug("TRIO pedigree is dict")
9674
9675                # Trio pedigree is not well formatted
9676                else:
9677                    msg_error = "TRIO pedigree not well formatted"
9678                    log.error(msg_error)
9679                    raise ValueError(msg_error)
9680
9681                # Construct trio list
9682                trio_samples = [
9683                    trio_ped.get("father", ""),
9684                    trio_ped.get("mother", ""),
9685                    trio_ped.get("child", ""),
9686                ]
9687
9688            else:
9689                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9690                samples_list = self.get_header_sample_list()
9691                if len(samples_list) >= 3:
9692                    trio_samples = self.get_header_sample_list()[0:3]
9693                    trio_ped = {
9694                        "father": trio_samples[0],
9695                        "mother": trio_samples[1],
9696                        "child": trio_samples[2],
9697                    }
9698                else:
9699                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9700                    log.error(msg_error)
9701                    raise ValueError(msg_error)
9702
9703            # Check trio pedigree
9704            if not trio_ped or len(trio_ped) != 3:
9705                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9706                log.error(msg_error)
9707                raise ValueError(msg_error)
9708
9709            # Log
9710            log.info(
9711                f"Calculation 'TRIO' - Samples: "
9712                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9713            )
9714
9715            # Field
9716            trio_infos = prefix + trio_tag
9717
9718            # Variants table
9719            table_variants = self.get_table_variants()
9720
9721            # Header
9722            vcf_reader = self.get_header()
9723
9724            # Create variant id
9725            variant_id_column = self.get_variant_id_column()
9726            added_columns = [variant_id_column]
9727
9728            # variant_id, FORMAT and samples
9729            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9730                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9731            )
9732
9733            # Create dataframe
9734            dataframe_trio = self.get_query_to_df(
9735                f""" SELECT {samples_fields} FROM {table_variants} """
9736            )
9737
9738            # Create trio column
9739            dataframe_trio[trio_infos] = dataframe_trio.apply(
9740                lambda row: trio(row, samples=trio_samples), axis=1
9741            )
9742
9743            # Add trio to header
9744            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9745                trio_tag,
9746                ".",
9747                "String",
9748                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9749                "howard calculation",
9750                "0",
9751                self.code_type_map.get("String"),
9752            )
9753
9754            # Update
9755            sql_update = f"""
9756                UPDATE {table_variants}
9757                SET "INFO" = 
9758                    concat(
9759                        CASE
9760                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9761                            THEN ''
9762                            ELSE concat("INFO", ';')
9763                        END,
9764                        CASE
9765                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9766                             AND dataframe_trio."{trio_infos}" NOT NULL
9767                            THEN concat(
9768                                    '{trio_tag}=',
9769                                    dataframe_trio."{trio_infos}"
9770                                )
9771                            ELSE ''
9772                        END
9773                    )
9774                FROM dataframe_trio
9775                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9776            """
9777            self.conn.execute(sql_update)
9778
9779            # Remove added columns
9780            for added_column in added_columns:
9781                self.drop_column(column=added_column)
9782
9783            # Delete dataframe
9784            del dataframe_trio
9785            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9787    def calculation_vaf_normalization(self) -> None:
9788        """
9789        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9790        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9791        :return: The function does not return anything.
9792        """
9793
9794        # if FORMAT and samples
9795        if (
9796            "FORMAT" in self.get_header_columns_as_list()
9797            and self.get_header_sample_list()
9798        ):
9799
9800            # vaf_normalization annotation field
9801            vaf_normalization_tag = "VAF"
9802
9803            # VCF infos tags
9804            vcf_infos_tags = {
9805                "VAF": "VAF Variant Frequency",
9806            }
9807
9808            # Prefix
9809            prefix = self.get_explode_infos_prefix()
9810
9811            # Variants table
9812            table_variants = self.get_table_variants()
9813
9814            # Header
9815            vcf_reader = self.get_header()
9816
9817            # Do not calculate if VAF already exists
9818            if "VAF" in vcf_reader.formats:
9819                log.debug("VAF already on genotypes")
9820                return
9821
9822            # Create variant id
9823            variant_id_column = self.get_variant_id_column()
9824            added_columns = [variant_id_column]
9825
9826            # variant_id, FORMAT and samples
9827            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9828                f""" "{sample}" """ for sample in self.get_header_sample_list()
9829            )
9830
9831            # Create dataframe
9832            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9833            log.debug(f"query={query}")
9834            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9835
9836            vaf_normalization_set = []
9837
9838            # for each sample vaf_normalization
9839            for sample in self.get_header_sample_list():
9840                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9841                    lambda row: vaf_normalization(row, sample=sample), axis=1
9842                )
9843                vaf_normalization_set.append(
9844                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9845                )
9846
9847            # Add VAF to FORMAT
9848            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9849                "FORMAT"
9850            ].apply(lambda x: str(x) + ":VAF")
9851            vaf_normalization_set.append(
9852                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9853            )
9854
9855            # Add vaf_normalization to header
9856            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9857                id=vaf_normalization_tag,
9858                num="1",
9859                type="Float",
9860                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9861                type_code=self.code_type_map.get("Float"),
9862            )
9863
9864            # Create fields to add in INFO
9865            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9866
9867            # Update
9868            sql_update = f"""
9869                UPDATE {table_variants}
9870                SET {sql_vaf_normalization_set}
9871                FROM dataframe_vaf_normalization
9872                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9873
9874            """
9875            self.conn.execute(sql_update)
9876
9877            # Remove added columns
9878            for added_column in added_columns:
9879                self.drop_column(column=added_column)
9880
9881            # Delete dataframe
9882            del dataframe_vaf_normalization
9883            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Statistics are computed row-by-row in pandas (via the `genotype_stats` helper applied to
        the FORMAT column and every sample column), then written back to the variants table with a
        single SQL UPDATE joined on the variant id column. The generated tags are also declared in
        the VCF header. No-op when the file has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotype stats only make sense when the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base name of the stats annotation field (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # Descriptions for each generated INFO tag.
            # NOTE: "mediane" (sic) is part of the emitted tag names; do not "fix" the
            # spelling without migrating downstream consumers.
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column that will hold the per-row stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column used as the join key (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed to compute stats: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Pull the genotype columns into a dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict for each variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per generated INFO tag
            sql_vaf_stats_fields = []

            # For each stat: extract its value into its own column, declare the tag in
            # the header, and build the SQL fragment that appends it to INFO
            for stat in vcf_infos_tags:

                # Extract this stat from the per-row dict ("" when absent)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stat tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First tag gets no leading separator; subsequent tags are ';'-separated.
                # NOTE(review): the separator is baked into each fragment, so if the
                # first stat is NULL but a later one is not, that later fragment still
                # carries its leading ';' — confirm INFO cannot gain a dangling ';'.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # SQL fragment: "<sep><stat>=<value>", or '' when the value is NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # All per-stat fragments become arguments of the concat() in the UPDATE
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the stats to INFO ('' and '.' are treated as an empty INFO)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove the temporary join-key column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory eagerly
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
10023    def calculation_transcripts_annotation(
10024        self, info_json: str = None, info_format: str = None
10025    ) -> None:
10026        """
10027        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10028        field to it if transcripts are available.
10029
10030        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10031        is a string parameter that represents the information field to be used in the transcripts JSON.
10032        It is used to specify the JSON format for the transcripts information. If no value is provided
10033        when calling the method, it defaults to "
10034        :type info_json: str
10035        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10036        method is a string parameter that specifies the format of the information field to be used in
10037        the transcripts JSON. It is used to define the format of the information field
10038        :type info_format: str
10039        """
10040
10041        # Create transcripts table
10042        transcripts_table = self.create_transcript_view()
10043
10044        # Add info field
10045        if transcripts_table:
10046            self.transcript_view_to_variants(
10047                transcripts_table=transcripts_table,
10048                transcripts_info_field_json=info_json,
10049                transcripts_info_field_format=info_format,
10050            )
10051        else:
10052            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
10054    def calculation_transcripts_prioritization(self) -> None:
10055        """
10056        The function `calculation_transcripts_prioritization` creates a transcripts table and
10057        prioritizes transcripts based on certain criteria.
10058        """
10059
10060        # Create transcripts table
10061        transcripts_table = self.create_transcript_view()
10062
10063        # Add info field
10064        if transcripts_table:
10065            self.transcripts_prioritization(transcripts_table=transcripts_table)
10066        else:
10067            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def calculation_transcripts_export(self) -> None:
10069    def calculation_transcripts_export(self) -> None:
10070        """ """
10071
10072        # Create transcripts table
10073        transcripts_table = self.create_transcript_view()
10074
10075        # Add info field
10076        if transcripts_table:
10077            self.transcripts_export(transcripts_table=transcripts_table)
10078        else:
10079            log.info("No Transcripts to process. Check param.json file configuration")
def transcripts_export(self, transcripts_table: str = None, param: dict = {}) -> bool:
10085    def transcripts_export(
10086        self, transcripts_table: str = None, param: dict = {}
10087    ) -> bool:
10088        """ """
10089
10090        log.debug("Start transcripts export...")
10091
10092        # Param
10093        if not param:
10094            param = self.get_param()
10095
10096        # Param export
10097        param_transcript_export = param.get("transcripts", {}).get("export", {})
10098
10099        # Output file
10100        transcripts_export_output = param_transcript_export.get("output", None)
10101
10102        if not param_transcript_export or not transcripts_export_output:
10103            log.warning(f"No transcriipts export parameters defined!")
10104            return False
10105
10106        # List of transcripts annotations
10107        query_describe = f"""
10108            SELECT column_name
10109            FROM (
10110                    DESCRIBE SELECT * FROM {transcripts_table}
10111                )
10112            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10113        """
10114        transcripts_annotations_list = list(
10115            self.get_query_to_df(query=query_describe)["column_name"]
10116        )
10117
10118        # Create transcripts table for export
10119        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10120            random.choices(string.ascii_uppercase + string.digits, k=10)
10121        )
10122        query_create_transcripts_table_export = f"""
10123            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10124        """
10125        self.execute_query(query=query_create_transcripts_table_export)
10126
10127        # Output file format
10128        transcripts_export_output_format = get_file_format(
10129            filename=transcripts_export_output
10130        )
10131
10132        # Format VCF - construct INFO
10133        if transcripts_export_output_format in ["vcf"]:
10134
10135            # Construct query update INFO and header
10136            query_update_info = []
10137            for field in transcripts_annotations_list:
10138
10139                # If field not in header
10140                if field not in self.get_header_infos_list():
10141
10142                    # Add PZ Transcript in header
10143                    self.get_header().infos[field] = vcf.parser._Info(
10144                        field,
10145                        ".",
10146                        "String",
10147                        f"Annotation '{field}' from transcript view",
10148                        "unknown",
10149                        "unknown",
10150                        0,
10151                    )
10152
10153                # Add field as INFO/tag
10154                query_update_info.append(
10155                    f"""
10156                        CASE
10157                            WHEN "{field}" IS NOT NULL
10158                            THEN concat('{field}=', "{field}", ';')    
10159                            ELSE ''     
10160                        END
10161                        """
10162                )
10163
10164            # Query param
10165            query_update_info_value = (
10166                f""" concat('',  {", ".join(query_update_info)}) """
10167            )
10168            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10169
10170        else:
10171
10172            # Query param
10173            query_update_info_value = f""" NULL """
10174            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10175
10176        # Update query INFO column
10177        query_update = f"""
10178            UPDATE {transcripts_table_export}
10179            SET INFO = {query_update_info_value}
10180
10181        """
10182        self.execute_query(query=query_update)
10183
10184        # Export
10185        self.export_output(
10186            output_file=transcripts_export_output,
10187            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10188        )
10189
10190        # Drop transcripts export table
10191        query_drop_transcripts_table_export = f"""
10192            DROP TABLE {transcripts_table_export}
10193        """
10194        self.execute_query(query=query_drop_transcripts_table_export)
def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        Side effects: declares the generated prioritization tags in the VCF header, explodes
        the required INFO fields on the transcripts table, and appends the selected
        transcript's annotations to the INFO column of the variants table.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table (build it when not provided)
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
        if transcripts_table is None:
            msg_err = "No Transcripts table availalble"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # Mapping: exploded column -> INFO tag name for the selected transcript
        pz_param_pzfields = {}

        # Tag carrying the selected transcript (e.g. "PTZTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Declare the selected-transcript tag in the header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory ones map prefixed->prefixed, custom ones map
        # original->prefixed and are declared in the header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Declare the renamed (prefixed) PZ field in the header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # Prioritization itself only produces the mandatory fields
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query.
        # NOTE: each SELECT entry below carries its own trailing comma; ROW_NUMBER()
        # follows directly in the ranking query.
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                    , CASE 
                        WHEN {pz_param_pzfield} IS NOT NULL
                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                        ELSE ''
                    END
                """
            )

        # Order by (defaults to Flag/Score descending when not configured)
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: each must exist either in the header or as a column
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing: when forced, file
            # order wins over the configured sort keys
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration (split_part
            # strips the ".N" version suffix when versions are not forced)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN 
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table (rank-1 transcript
        # per variant is appended to INFO).
        # NOTE(review): the WHERE clause references the literal table name 'variants' —
        # this assumes get_table_variants() returns 'variants'; confirm before reusing
        # with a differently named variants table.
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
                SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                        )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"     
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10507    def create_transcript_view_from_columns_map(
10508        self,
10509        transcripts_table: str = "transcripts",
10510        columns_maps: dict = {},
10511        added_columns: list = [],
10512        temporary_tables: list = None,
10513        annotation_fields: list = None,
10514        column_rename: dict = {},
10515        column_clean: bool = False,
10516        column_case: str = None,
10517    ) -> tuple[list, list, list]:
10518        """
10519        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10520        specified columns mapping for transcripts data.
10521
10522        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10523        of the table where the transcripts data is stored or will be stored in the database. This table
10524        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10525        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10526        :type transcripts_table: str (optional)
10527        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10528        about how to map columns from a transcripts table to create a view. Each entry in the
10529        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10530        typically includes details such as the main transcript column and additional information columns
10531        :type columns_maps: dict
10532        :param added_columns: The `added_columns` parameter in the
10533        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10534        that will be added to the view being created based on the columns map provided. These columns
10535        are generated by exploding the transcript information columns along with the main transcript
10536        column
10537        :type added_columns: list
10538        :param temporary_tables: The `temporary_tables` parameter in the
10539        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10540        tables created during the process of creating a transcript view from a columns map. These
10541        temporary tables are used to store intermediate results or transformations before the final view
10542        is generated
10543        :type temporary_tables: list
10544        :param annotation_fields: The `annotation_fields` parameter in the
10545        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10546        used for annotation in the query view creation process. These fields are extracted from the
10547        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10548        :type annotation_fields: list
10549        :param column_rename: The `column_rename` parameter in the
10550        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10551        custom renaming for columns during the creation of the temporary table view. This parameter
10552        provides a mapping of original column names to the desired renamed column names. By using this
10553        parameter,
10554        :type column_rename: dict
10555        :param column_clean: The `column_clean` parameter in the
10556        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10557        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10558        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10559        False
10560        :type column_clean: bool (optional)
10561        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10562        function is used to specify the case transformation to be applied to the columns during the view
10563        creation process. It allows you to control whether the column values should be converted to
10564        lowercase, uppercase, or remain unchanged
10565        :type column_case: str
10566        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10567        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10568        """
10569
10570        log.debug("Start transcrpts view creation from columns map...")
10571
10572        # "from_columns_map": [
10573        #     {
10574        #         "transcripts_column": "Ensembl_transcriptid",
10575        #         "transcripts_infos_columns": [
10576        #             "genename",
10577        #             "Ensembl_geneid",
10578        #             "LIST_S2_score",
10579        #             "LIST_S2_pred",
10580        #         ],
10581        #     },
10582        #     {
10583        #         "transcripts_column": "Ensembl_transcriptid",
10584        #         "transcripts_infos_columns": [
10585        #             "genename",
10586        #             "VARITY_R_score",
10587        #             "Aloft_pred",
10588        #         ],
10589        #     },
10590        # ],
10591
10592        # Init
10593        if temporary_tables is None:
10594            temporary_tables = []
10595        if annotation_fields is None:
10596            annotation_fields = []
10597
10598        # Variants table
10599        table_variants = self.get_table_variants()
10600
10601        for columns_map in columns_maps:
10602
10603            # Log
10604            log.debug(f"columns_map={columns_map}")
10605
10606            # Transcript column
10607            transcripts_column = columns_map.get("transcripts_column", None)
10608
10609            # Transcripts infos columns
10610            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10611
10612            # Transcripts infos columns rename
10613            column_rename = columns_map.get("column_rename", column_rename)
10614
10615            # Transcripts infos columns clean
10616            column_clean = columns_map.get("column_clean", column_clean)
10617
10618            # Transcripts infos columns case
10619            column_case = columns_map.get("column_case", column_case)
10620
10621            if transcripts_column is not None:
10622
10623                # Explode
10624                added_columns += self.explode_infos(
10625                    fields=[transcripts_column] + transcripts_infos_columns
10626                )
10627
10628                # View clauses
10629                clause_select_variants = []
10630                clause_select_tanscripts = []
10631                for field in [transcripts_column] + transcripts_infos_columns:
10632
10633                    # AS field
10634                    as_field = field
10635
10636                    # Rename
10637                    if column_rename:
10638                        as_field = column_rename.get(as_field, as_field)
10639
10640                    # Clean
10641                    if column_clean:
10642                        as_field = clean_annotation_field(as_field)
10643
10644                    # Case
10645                    if column_case:
10646                        if column_case.lower() in ["lower"]:
10647                            as_field = as_field.lower()
10648                        elif column_case.lower() in ["upper"]:
10649                            as_field = as_field.upper()
10650
10651                    # Clause select Variants
10652                    clause_select_variants.append(
10653                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10654                    )
10655
10656                    if field in [transcripts_column]:
10657                        clause_select_tanscripts.append(
10658                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10659                        )
10660                    else:
10661                        clause_select_tanscripts.append(
10662                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10663                        )
10664                        annotation_fields.append(as_field)
10665
10666                # Query View
10667                query = f""" 
10668                    SELECT
10669                        "#CHROM", POS, REF, ALT, INFO,
10670                        "{transcripts_column}" AS 'transcript',
10671                        {", ".join(clause_select_tanscripts)}
10672                    FROM (
10673                        SELECT 
10674                            "#CHROM", POS, REF, ALT, INFO,
10675                            {", ".join(clause_select_variants)}
10676                        FROM {table_variants}
10677                        )
10678                    WHERE "{transcripts_column}" IS NOT NULL
10679                """
10680
10681                # Create temporary table
10682                temporary_table = transcripts_table + "".join(
10683                    random.choices(string.ascii_uppercase + string.digits, k=10)
10684                )
10685
10686                # # Temporary_tables
10687                # temporary_tables.append(temporary_table)
10688                # query_view = f"""
10689                #     CREATE TEMPORARY TABLE {temporary_table}
10690                #     AS ({query})
10691                # """
10692                # self.execute_query(query=query_view)
10693
10694                # Temporary_tables
10695                temporary_tables.append(temporary_table)
10696
10697                # List of unique #CHROM
10698                query_unique_chrom = f"""
10699                    SELECT DISTINCT "#CHROM"
10700                    FROM variants
10701                """
10702                unique_chroms = self.get_query_to_df(query=query_unique_chrom)
10703
10704                # Create table with structure but without data
10705                query_create_table = f"""
10706                    CREATE TABLE {temporary_table}
10707                    AS ({query} LIMIT 0)
10708                """
10709                self.execute_query(query=query_create_table)
10710
10711                # Process by #CHROM
10712                for chrom in unique_chroms["#CHROM"]:
10713
10714                    # Log
10715                    log.debug(f"Processing #CHROM={chrom}")
10716
10717                    # Select data by #CHROM
10718                    query_chunk = f"""
10719                        SELECT *
10720                        FROM ({query})
10721                        WHERE "#CHROM" = '{chrom}'
10722                    """
10723
10724                    # Insert data
10725                    query_insert_chunk = f"""
10726                        INSERT INTO {temporary_table}
10727                        {query_chunk}
10728                    """
10729                    self.execute_query(query=query_insert_chunk)
10730
10731        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a list of mapping dictionaries that describe how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns map configuration.
  • column_rename: The column_rename parameter in the create_transcript_view_from_columns_map function is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names.
  • column_clean: The column_clean parameter in the create_transcript_view_from_columns_map function is a boolean flag that determines whether the column values should be cleaned or not. If set to True, the column values will be cleaned by removing any non-alphanumeric characters from them. Defaults to False.
  • column_case: The column_case parameter in the create_transcript_view_from_columns_map function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns

The create_transcript_view_from_columns_map function returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10733    def create_transcript_view_from_column_format(
10734        self,
10735        transcripts_table: str = "transcripts",
10736        column_formats: dict = {},
10737        temporary_tables: list = None,
10738        annotation_fields: list = None,
10739        column_rename: dict = {},
10740        column_clean: bool = False,
10741        column_case: str = None,
10742    ) -> tuple[list, list, list]:
10743        """
10744        The `create_transcript_view_from_column_format` function generates a transcript view based on
10745        specified column formats, adds additional columns and annotation fields, and returns the list of
10746        temporary tables and annotation fields.
10747
10748        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10749        of the table containing the transcripts data. This table will be used as the base table for
10750        creating the transcript view. The default value for this parameter is "transcripts", but you can
10751        provide a different table name if needed, defaults to transcripts
10752        :type transcripts_table: str (optional)
10753        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10754        about the columns to be used for creating the transcript view. Each entry in the dictionary
10755        specifies the mapping between a transcripts column and a transcripts infos column. This
10756        parameter allows you to define how the columns from the transcripts table should be transformed
10757        or mapped
10758        :type column_formats: dict
10759        :param temporary_tables: The `temporary_tables` parameter in the
10760        `create_transcript_view_from_column_format` function is a list that stores the names of
10761        temporary views created during the process of creating a transcript view from a column format.
10762        These temporary views are used to manipulate and extract data before generating the final
10763        transcript view
10764        :type temporary_tables: list
10765        :param annotation_fields: The `annotation_fields` parameter in the
10766        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10767        that are extracted from the temporary views created during the process. These annotation fields
10768        are obtained by querying the temporary views and extracting the column names excluding specific
10769        columns like `#CH
10770        :type annotation_fields: list
10771        :param column_rename: The `column_rename` parameter in the
10772        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10773        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10774        column names to new column names in this dictionary, you can rename specific columns during the
10775        process
10776        :type column_rename: dict
10777        :param column_clean: The `column_clean` parameter in the
10778        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10779        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10780        will be cleaned during the creation of the transcript view based on the specified column format,
10781        defaults to False
10782        :type column_clean: bool (optional)
10783        :param column_case: The `column_case` parameter in the
10784        `create_transcript_view_from_column_format` function is used to specify the case transformation
10785        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10786        to convert the column names to uppercase or lowercase, respectively
10787        :type column_case: str
10788        :return: The `create_transcript_view_from_column_format` function returns two lists:
10789        `temporary_tables` and `annotation_fields`.
10790        """
10791
10792        log.debug("Start transcrpts view creation from column format...")
10793
10794        #  "from_column_format": [
10795        #     {
10796        #         "transcripts_column": "ANN",
10797        #         "transcripts_infos_column": "Feature_ID",
10798        #     }
10799        # ],
10800
10801        # Init
10802        if temporary_tables is None:
10803            temporary_tables = []
10804        if annotation_fields is None:
10805            annotation_fields = []
10806
10807        for column_format in column_formats:
10808
10809            # annotation field and transcript annotation field
10810            annotation_field = column_format.get("transcripts_column", "ANN")
10811            transcript_annotation = column_format.get(
10812                "transcripts_infos_column", "Feature_ID"
10813            )
10814
10815            # Transcripts infos columns rename
10816            column_rename = column_format.get("column_rename", column_rename)
10817
10818            # Transcripts infos columns clean
10819            column_clean = column_format.get("column_clean", column_clean)
10820
10821            # Transcripts infos columns case
10822            column_case = column_format.get("column_case", column_case)
10823
10824            # Temporary View name
10825            temporary_view_name = transcripts_table + "".join(
10826                random.choices(string.ascii_uppercase + string.digits, k=10)
10827            )
10828
10829            # Create temporary view name
10830            temporary_view_name = self.annotation_format_to_table(
10831                uniquify=True,
10832                annotation_field=annotation_field,
10833                view_name=temporary_view_name,
10834                annotation_id=transcript_annotation,
10835                column_rename=column_rename,
10836                column_clean=column_clean,
10837                column_case=column_case,
10838            )
10839
10840            # Annotation fields
10841            if temporary_view_name:
10842                query_annotation_fields = f"""
10843                    SELECT *
10844                    FROM (
10845                        DESCRIBE SELECT *
10846                        FROM {temporary_view_name}
10847                        )
10848                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10849                """
10850                df_annotation_fields = self.get_query_to_df(
10851                    query=query_annotation_fields
10852                )
10853
10854                # Add temporary view and annotation fields
10855                temporary_tables.append(temporary_view_name)
10856                annotation_fields += list(set(df_annotation_fields["column_name"]))
10857
10858        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding the key columns #CHROM, POS, REF, and ALT.
  • column_rename: The column_rename parameter in the create_transcript_view_from_column_format function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process
  • column_clean: The column_clean parameter in the create_transcript_view_from_column_format function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to True, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_column_format function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = False, param: dict = {}) -> str:
10860    def create_transcript_view(
10861        self,
10862        transcripts_table: str = None,
10863        transcripts_table_drop: bool = False,
10864        param: dict = {},
10865    ) -> str:
10866        """
10867        The `create_transcript_view` function generates a transcript view by processing data from a
10868        specified table based on provided parameters and structural information.
10869
10870        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10871        is used to specify the name of the table that will store the final transcript view data. If a table
10872        name is not provided, the function will create a new table to store the transcript view data, and by
10873        default,, defaults to transcripts
10874        :type transcripts_table: str (optional)
10875        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10876        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10877        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10878        the function will drop the existing transcripts table if it exists, defaults to False
10879        :type transcripts_table_drop: bool (optional)
10880        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10881        contains information needed to create a transcript view. It includes details such as the structure
10882        of the transcripts, columns mapping, column formats, and other necessary information for generating
10883        the view. This parameter allows for flexibility and customization
10884        :type param: dict
10885        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10886        created or modified during the execution of the function.
10887        """
10888
10889        log.debug("Start transcripts view creation...")
10890
10891        # Default
10892        transcripts_table_default = "transcripts"
10893
10894        # Param
10895        if not param:
10896            param = self.get_param()
10897
10898        # Struct
10899        struct = param.get("transcripts", {}).get("struct", None)
10900
10901        # Transcript veresion
10902        transcript_id_remove_version = param.get("transcripts", {}).get(
10903            "transcript_id_remove_version", False
10904        )
10905
10906        # Transcripts mapping
10907        transcript_id_mapping_file = param.get("transcripts", {}).get(
10908            "transcript_id_mapping_file", None
10909        )
10910
10911        # Transcripts mapping
10912        transcript_id_mapping_force = param.get("transcripts", {}).get(
10913            "transcript_id_mapping_force", None
10914        )
10915
10916        # Transcripts table
10917        if transcripts_table is None:
10918            transcripts_table = param.get("transcripts", {}).get(
10919                "table", transcripts_table_default
10920            )
10921
10922        # Check transcripts table exists
10923        if transcripts_table:
10924
10925            # Query to check if transcripts table exists
10926            query_check_table = f"""
10927                SELECT * 
10928                FROM information_schema.tables 
10929                WHERE table_name = '{transcripts_table}'
10930            """
10931            df_check_table = self.get_query_to_df(query=query_check_table)
10932
10933            # Check if transcripts table exists
10934            if len(df_check_table) > 0 and not transcripts_table_drop:
10935                log.debug(f"Table {transcripts_table} exists and not drop option")
10936                return transcripts_table
10937
10938        if struct:
10939
10940            # added_columns
10941            added_columns = []
10942
10943            # Temporary tables
10944            temporary_tables = []
10945
10946            # Annotation fields
10947            annotation_fields = []
10948
10949            # from columns map
10950            columns_maps = struct.get("from_columns_map", [])
10951            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10952                self.create_transcript_view_from_columns_map(
10953                    transcripts_table=transcripts_table,
10954                    columns_maps=columns_maps,
10955                    added_columns=added_columns,
10956                    temporary_tables=temporary_tables,
10957                    annotation_fields=annotation_fields,
10958                )
10959            )
10960            added_columns += added_columns_tmp
10961            temporary_tables += temporary_tables_tmp
10962            annotation_fields += annotation_fields_tmp
10963
10964            # from column format
10965            column_formats = struct.get("from_column_format", [])
10966            temporary_tables_tmp, annotation_fields_tmp = (
10967                self.create_transcript_view_from_column_format(
10968                    transcripts_table=transcripts_table,
10969                    column_formats=column_formats,
10970                    temporary_tables=temporary_tables,
10971                    annotation_fields=annotation_fields,
10972                )
10973            )
10974            temporary_tables += temporary_tables_tmp
10975            annotation_fields += annotation_fields_tmp
10976
10977            # Remove some specific fields/column
10978            annotation_fields = list(set(annotation_fields))
10979            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10980                if field in annotation_fields:
10981                    annotation_fields.remove(field)
10982
10983            # Merge temporary tables query
10984            query_merge = ""
10985            for temporary_table in list(set(temporary_tables)):
10986
10987                # First temporary table
10988                if not query_merge:
10989                    query_merge = f"""
10990                        SELECT * FROM {temporary_table}
10991                    """
10992                # other temporary table (using UNION)
10993                else:
10994                    query_merge += f"""
10995                        UNION BY NAME SELECT * FROM {temporary_table}
10996                    """
10997
10998            # transcript table tmp
10999            transcript_table_tmp = "transcripts_tmp"
11000            transcript_table_tmp2 = "transcripts_tmp2"
11001            transcript_table_tmp3 = "transcripts_tmp3"
11002
11003            # Merge on transcript
11004            query_merge_on_transcripts_annotation_fields = []
11005
11006            # Add transcript list
11007            query_merge_on_transcripts_annotation_fields.append(
11008                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
11009            )
11010
11011            # Aggregate all annotations fields
11012            for annotation_field in set(annotation_fields):
11013                query_merge_on_transcripts_annotation_fields.append(
11014                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
11015                )
11016
11017            # Transcripts mapping
11018            if transcript_id_mapping_file:
11019
11020                # Transcript dataframe
11021                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
11022                transcript_id_mapping_dataframe = transcripts_file_to_df(
11023                    transcript_id_mapping_file, column_names=["transcript", "alias"]
11024                )
11025
11026                # Transcript version remove
11027                if transcript_id_remove_version:
11028                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
11029                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
11030                    query_left_join = f"""
11031                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11032                    """
11033                else:
11034                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
11035                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
11036                    query_left_join = f"""
11037                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11038                    """
11039
11040                # Transcript column for group by merge
11041                query_transcript_merge_group_by = """
11042                        CASE
11043                            WHEN transcript_mapped NOT IN ('')
11044                            THEN split_part(transcript_mapped, '.', 1)
11045                            ELSE split_part(transcript_original, '.', 1)
11046                        END
11047                    """
11048
11049                # Merge query
11050                transcripts_tmp2_query = f"""
11051                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
11052                    FROM ({query_merge}) AS {transcript_table_tmp}
11053                    {query_left_join}
11054                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
11055                """
11056
11057                # Retrieve columns after merge
11058                transcripts_tmp2_describe_query = f"""
11059                    DESCRIBE {transcripts_tmp2_query}
11060                """
11061                transcripts_tmp2_describe_list = list(
11062                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
11063                        "column_name"
11064                    ]
11065                )
11066
11067                # Create list of columns for select clause
11068                transcripts_tmp2_describe_select_clause = []
11069                for field in transcripts_tmp2_describe_list:
11070                    if field not in [
11071                        "#CHROM",
11072                        "POS",
11073                        "REF",
11074                        "ALT",
11075                        "INFO",
11076                        "transcript_mapped",
11077                    ]:
11078                        as_field = field
11079                        if field in ["transcript_original"]:
11080                            as_field = "transcripts_mapped"
11081                        transcripts_tmp2_describe_select_clause.append(
11082                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11083                        )
11084
11085                # Merge with mapping
11086                query_merge_on_transcripts = f"""
11087                    SELECT
11088                        "#CHROM", POS, REF, ALT, INFO,
11089                        CASE
11090                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11091                            THEN ANY_VALUE(transcript_mapped)
11092                            ELSE ANY_VALUE(transcript_original)
11093                        END AS transcript,
11094                        {", ".join(transcripts_tmp2_describe_select_clause)}
11095                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11096                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11097                        {query_transcript_merge_group_by}
11098                """
11099
11100                # Add transcript filter from mapping file
11101                if transcript_id_mapping_force:
11102                    query_merge_on_transcripts = f"""
11103                        SELECT *
11104                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11105                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11106                    """
11107
11108            # No transcript mapping
11109            else:
11110
11111                # Remove transcript version
11112                if transcript_id_remove_version:
11113                    query_transcript_column = f"""
11114                        split_part({transcript_table_tmp}.transcript, '.', 1)
11115                    """
11116                else:
11117                    query_transcript_column = """
11118                        transcript
11119                    """
11120
11121                # Query sections
11122                query_transcript_column_select = (
11123                    f"{query_transcript_column} AS transcript"
11124                )
11125                query_transcript_column_group_by = query_transcript_column
11126
11127                # Query for transcripts view
11128                query_merge_on_transcripts = f"""
11129                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11130                    FROM ({query_merge}) AS {transcript_table_tmp}
11131                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11132                """
11133
11134            # Drop transcript view if necessary
11135            if transcripts_table_drop:
11136                query_drop = f"""
11137                    DROP TABLE IF EXISTS {transcripts_table};
11138                """
11139                self.execute_query(query=query_drop)
11140
11141            # # Merge and create transcript view
11142            # query_create_view = f"""
11143            #     CREATE TABLE IF NOT EXISTS {transcripts_table}
11144            #     AS {query_merge_on_transcripts}
11145            # """
11146            # self.execute_query(query=query_create_view)
11147
11148            # Using #CHROM chunk
11149            ######
11150
11151            # List of unique #CHROM
11152            query_unique_chrom = f"""
11153                SELECT DISTINCT "#CHROM"
11154                FROM variants AS subquery
11155            """
11156            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11157
11158            # Create table with structure but without data, if not exists
11159            query_create_table = f"""
11160                CREATE TABLE IF NOT EXISTS {transcripts_table} AS
11161                SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0
11162            """
11163            self.execute_query(query=query_create_table)
11164
11165            # Process by #CHROM
11166            for chrom in unique_chroms["#CHROM"]:
11167
11168                # Log
11169                log.debug(f"Processing #CHROM={chrom}")
11170
11171                # Select data by #CHROM
11172                query_chunk = f"""
11173                    SELECT *
11174                    FROM ({query_merge_on_transcripts})
11175                    WHERE "#CHROM" = '{chrom}'
11176                """
11177
11178                # Insert data
11179                query_insert_chunk = f"""
11180                    INSERT INTO {transcripts_table}
11181                    {query_chunk}
11182                """
11183                self.execute_query(query=query_insert_chunk)
11184
11185            # Remove temporary tables
11186            if temporary_tables:
11187                for temporary_table in list(set(temporary_tables)):
11188                    query_drop_tmp_table = f"""
11189                        DROP TABLE IF EXISTS {temporary_table}
11190                    """
11191                    self.execute_query(query=query_drop_tmp_table)
11192
11193            # Remove added columns
11194            for added_column in added_columns:
11195                self.drop_column(column=added_column)
11196
11197        else:
11198
11199            transcripts_table = None
11200
11201        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function creates a new table to store the transcript view data; defaults to transcripts
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to False
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts', column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> str:
11203    def annotation_format_to_table(
11204        self,
11205        uniquify: bool = True,
11206        annotation_field: str = "ANN",
11207        annotation_id: str = "Feature_ID",
11208        view_name: str = "transcripts",
11209        column_rename: dict = {},
11210        column_clean: bool = False,
11211        column_case: str = None,
11212    ) -> str:
11213        """
11214        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11215        structured table format, ensuring unique values and creating a temporary table for further
11216        processing or analysis.
11217
11218        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11219        unique values in the output or not. If set to `True`, the function will make sure that the
11220        output values are unique, defaults to True
11221        :type uniquify: bool (optional)
11222        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11223        that contains the annotation information for each variant. This field is used to extract the
11224        annotation details for further processing in the function. By default, it is set to "ANN",
11225        defaults to ANN
11226        :type annotation_field: str (optional)
11227        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11228        is used to specify the identifier for the annotation feature. This identifier will be used as a
11229        column name in the resulting table or view that is created based on the annotation data. It
11230        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11231        :type annotation_id: str (optional)
11232        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11233        to specify the name of the temporary table that will be created to store the transformed
11234        annotation data. This table will hold the extracted information from the annotation field in a
11235        structured format for further processing or analysis. By default,, defaults to transcripts
11236        :type view_name: str (optional)
11237        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11238        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11239        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11240        created based on the annotation data. This feature enables
11241        :type column_rename: dict
11242        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11243        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11244        If set to `True`, the function will clean the annotation field before further processing. This
11245        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11246        to False
11247        :type column_clean: bool (optional)
11248        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11249        used to specify the case transformation to be applied to the column names extracted from the
11250        annotation data. It allows you to set the case of the column names to either lowercase or
11251        uppercase for consistency or other specific requirements during the conversion
11252        :type column_case: str
11253        :return: The function `annotation_format_to_table` is returning the name of the view created,
11254        which is stored in the variable `view_name`.
11255        """
11256
11257        # Annotation field
11258        annotation_format = "annotation_explode"
11259
11260        # Transcript annotation
11261        if column_rename:
11262            annotation_id = column_rename.get(annotation_id, annotation_id)
11263
11264        if column_clean:
11265            annotation_id = clean_annotation_field(annotation_id)
11266
11267        # Prefix
11268        prefix = self.get_explode_infos_prefix()
11269        if prefix:
11270            prefix = "INFO/"
11271
11272        # Annotation fields
11273        annotation_infos = prefix + annotation_field
11274        annotation_format_infos = prefix + annotation_format
11275
11276        # Variants table
11277        table_variants = self.get_table_variants()
11278
11279        # Header
11280        vcf_reader = self.get_header()
11281
11282        # Add columns
11283        added_columns = []
11284
11285        # Explode HGVS field in column
11286        added_columns += self.explode_infos(fields=[annotation_field])
11287
11288        if annotation_field in vcf_reader.infos:
11289
11290            # Extract ANN header
11291            ann_description = vcf_reader.infos[annotation_field].desc
11292            pattern = r"'(.+?)'"
11293            match = re.search(pattern, ann_description)
11294            if match:
11295                ann_header_match = match.group(1).split(" | ")
11296                ann_header = []
11297                ann_header_desc = {}
11298                for i in range(len(ann_header_match)):
11299                    ann_header_info = "".join(
11300                        char for char in ann_header_match[i] if char.isalnum()
11301                    )
11302                    ann_header.append(ann_header_info)
11303                    ann_header_desc[ann_header_info] = ann_header_match[i]
11304                if not ann_header_desc:
11305                    raise ValueError("Invalid header description format")
11306            else:
11307                raise ValueError("Invalid header description format")
11308
11309            # Create variant id
11310            variant_id_column = self.get_variant_id_column()
11311            added_columns += [variant_id_column]
11312
11313            # Get list of #CHROM
11314            query_unique_chrom = f"""
11315                SELECT DISTINCT "#CHROM"
11316                FROM variants AS subquery
11317            """
11318            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11319
11320            # Base for database anontation format
11321            dataframe_annotation_format_base = f"""
11322                SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}"
11323                FROM {table_variants}
11324            """
11325
11326            # Create dataframe for keys column type
11327            dataframe_annotation_format = self.get_query_to_df(
11328                f""" {dataframe_annotation_format_base} LIMIT 1000 """
11329            )
11330
11331            # Define a vectorized function to apply explode_annotation_format
11332            vectorized_explode_annotation_format = np.vectorize(
11333                lambda x: explode_annotation_format(
11334                    annotation=str(x),
11335                    uniquify=uniquify,
11336                    output_format="JSON",
11337                    prefix="",
11338                    header=list(ann_header_desc.values()),
11339                )
11340            )
11341
11342            # Assign the exploded annotations back to the dataframe
11343            dataframe_annotation_format[annotation_format_infos] = (
11344                vectorized_explode_annotation_format(
11345                    dataframe_annotation_format[annotation_infos].to_numpy()
11346                )
11347            )
11348
11349            # Find keys
11350            query_json = f"""
11351                SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key'
11352                FROM dataframe_annotation_format;
11353            """
11354            df_keys = self.get_query_to_df(query=query_json)
11355
11356            # Check keys
11357            query_json_key = []
11358            for _, row in df_keys.iterrows():
11359
11360                # Key
11361                key = row.iloc[0]
11362                key_clean = key
11363
11364                # key rename
11365                if column_rename:
11366                    key_clean = column_rename.get(key_clean, key_clean)
11367
11368                # key clean
11369                if column_clean:
11370                    key_clean = clean_annotation_field(key_clean)
11371
11372                # Key case
11373                if column_case:
11374                    if column_case.lower() in ["lower"]:
11375                        key_clean = key_clean.lower()
11376                    elif column_case.lower() in ["upper"]:
11377                        key_clean = key_clean.upper()
11378
11379                # Type
11380                query_json_type = f"""
11381                    SELECT * 
11382                    FROM (
11383                        SELECT 
11384                            NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '') AS '{key_clean}'
11385                        FROM
11386                            dataframe_annotation_format
11387                        )
11388                    WHERE "{key_clean}" NOT NULL AND "{key_clean}" NOT IN ('')
11389                """
11390
11391                # Get DataFrame from query
11392                df_json_type = self.get_query_to_df(query=query_json_type)
11393
11394                # Detect column type
11395                column_type = detect_column_type(df_json_type[key_clean])
11396
11397                # Free up memory
11398                del df_json_type
11399
11400                # Append
11401                query_json_key.append(
11402                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11403                )
11404
11405            # Create table with structure but without data, if not exists
11406            query_create_table = f"""
11407                CREATE TABLE IF NOT EXISTS {view_name}
11408                AS (
11409                    SELECT *, {annotation_id} AS 'transcript'
11410                    FROM (
11411                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11412                        FROM dataframe_annotation_format
11413                        )
11414                    LIMIT 0
11415                    );
11416            """
11417            self.execute_query(query=query_create_table)
11418
11419            # Free up memory
11420            del dataframe_annotation_format
11421
11422            # Insert data by chromosome
11423            for chrom in unique_chroms["#CHROM"]:
11424
11425                # Log
11426                log.debug(f"Processing #CHROM={chrom}")
11427
11428                # Create dataframe
11429                dataframe_annotation_format = self.get_query_to_df(
11430                    f""" {dataframe_annotation_format_base}  WHERE "#CHROM" = '{chrom}' """
11431                )
11432
11433                # Define a vectorized function to apply explode_annotation_format
11434                vectorized_explode_annotation_format = np.vectorize(
11435                    lambda x: explode_annotation_format(
11436                        annotation=str(x),
11437                        uniquify=uniquify,
11438                        output_format="JSON",
11439                        prefix="",
11440                        header=list(ann_header_desc.values()),
11441                    )
11442                )
11443
11444                # Assign the exploded annotations back to the dataframe
11445                dataframe_annotation_format[annotation_format_infos] = (
11446                    vectorized_explode_annotation_format(
11447                        dataframe_annotation_format[annotation_infos].to_numpy()
11448                    )
11449                )
11450
11451                # Insert data into tmp table
11452                query_insert_chunk = f"""
11453                    INSERT INTO {view_name}
11454                    SELECT *, {annotation_id} AS 'transcript'
11455                    FROM (
11456                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11457                        FROM dataframe_annotation_format
11458                        )
11459                """
11460                self.execute_query(query=query_insert_chunk)
11461
11462                # Free up memory
11463                del dataframe_annotation_format
11464
11465        else:
11466
11467            # Return None
11468            view_name = None
11469
11470        # Remove added columns
11471        for added_column in added_columns:
11472            self.drop_column(column=added_column)
11473
11474        return view_name

The annotation_format_to_table function converts annotation data from a VCF file into a structured table format, ensuring unique values and creating a temporary table for further processing or analysis.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. By default, it is set to "ANN", defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table; defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to transcripts
  • column_rename: The column_rename parameter in the annotation_format_to_table method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. This feature enables
  • column_clean: The column_clean parameter in the annotation_format_to_table method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to True, the function will clean the annotation field before further processing. This cleaning step may involve removing unwanted characters or formatting inconsistencies; defaults to False
  • column_case: The column_case parameter in the annotation_format_to_table method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
11476    def transcript_view_to_variants(
11477        self,
11478        transcripts_table: str = None,
11479        transcripts_column_id: str = None,
11480        transcripts_info_json: str = None,
11481        transcripts_info_field_json: str = None,
11482        transcripts_info_format: str = None,
11483        transcripts_info_field_format: str = None,
11484        param: dict = {},
11485    ) -> bool:
11486        """
11487        The `transcript_view_to_variants` function updates a variants table with information from
11488        transcripts in JSON format.
11489
11490        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11491        table containing the transcripts data. If this parameter is not provided, the function will
11492        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11493        :type transcripts_table: str
11494        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11495        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11496        identifier is used to match transcripts with variants in the database
11497        :type transcripts_column_id: str
11498        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11499        of the column in the variants table where the transcripts information will be stored in JSON
11500        format. This parameter allows you to define the column in the variants table that will hold the
11501        JSON-formatted information about transcripts
11502        :type transcripts_info_json: str
11503        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11504        specify the field in the VCF header that will contain information about transcripts in JSON
11505        format. This field will be added to the VCF header as an INFO field with the specified name
11506        :type transcripts_info_field_json: str
11507        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11508        format of the information about transcripts that will be stored in the variants table. This
11509        format can be used to define how the transcript information will be structured or displayed
11510        within the variants table
11511        :type transcripts_info_format: str
11512        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11513        specify the field in the VCF header that will contain information about transcripts in a
11514        specific format. This field will be added to the VCF header as an INFO field with the specified
11515        name
11516        :type transcripts_info_field_format: str
11517        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11518        that contains various configuration settings related to transcripts. It is used to provide
11519        default values for certain parameters if they are not explicitly provided when calling the
11520        method. The `param` dictionary can be passed as an argument
11521        :type param: dict
11522        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11523        if the operation is successful and `False` if certain conditions are not met.
11524        """
11525
11526        msg_info_prefix = "Start transcripts view to variants annotations"
11527
11528        log.debug(f"{msg_info_prefix}...")
11529
11530        # Default
11531        transcripts_table_default = "transcripts"
11532        transcripts_column_id_default = "transcript"
11533        transcripts_info_json_default = None
11534        transcripts_info_format_default = None
11535        transcripts_info_field_json_default = None
11536        transcripts_info_field_format_default = None
11537
11538        # Param
11539        if not param:
11540            param = self.get_param()
11541
11542        # Transcripts table
11543        if transcripts_table is None:
11544            transcripts_table = param.get("transcripts", {}).get(
11545                "table", transcripts_table_default
11546            )
11547
11548        # Transcripts column ID
11549        if transcripts_column_id is None:
11550            transcripts_column_id = param.get("transcripts", {}).get(
11551                "column_id", transcripts_column_id_default
11552            )
11553
11554        # Transcripts info json
11555        if transcripts_info_json is None:
11556            transcripts_info_json = param.get("transcripts", {}).get(
11557                "transcripts_info_json", transcripts_info_json_default
11558            )
11559
11560        # Transcripts info field JSON
11561        if transcripts_info_field_json is None:
11562            transcripts_info_field_json = param.get("transcripts", {}).get(
11563                "transcripts_info_field_json", transcripts_info_field_json_default
11564            )
11565        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11566        #     transcripts_info_json = transcripts_info_field_json
11567
11568        # Transcripts info format
11569        if transcripts_info_format is None:
11570            transcripts_info_format = param.get("transcripts", {}).get(
11571                "transcripts_info_format", transcripts_info_format_default
11572            )
11573
11574        # Transcripts info field FORMAT
11575        if transcripts_info_field_format is None:
11576            transcripts_info_field_format = param.get("transcripts", {}).get(
11577                "transcripts_info_field_format", transcripts_info_field_format_default
11578            )
11579        # if (
11580        #     transcripts_info_field_format is not None
11581        #     and transcripts_info_format is None
11582        # ):
11583        #     transcripts_info_format = transcripts_info_field_format
11584
11585        # Variants table
11586        table_variants = self.get_table_variants()
11587
11588        # Check info columns param
11589        if (
11590            transcripts_info_json is None
11591            and transcripts_info_field_json is None
11592            and transcripts_info_format is None
11593            and transcripts_info_field_format is None
11594        ):
11595            return False
11596
11597        # Transcripts infos columns
11598        query_transcripts_infos_columns = f"""
11599            SELECT *
11600            FROM (
11601                DESCRIBE SELECT * FROM {transcripts_table}
11602                )
11603            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11604        """
11605        transcripts_infos_columns = list(
11606            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11607        )
11608
11609        # View results
11610        clause_select = []
11611        clause_to_json = []
11612        clause_to_format = []
11613        for field in transcripts_infos_columns:
11614            # Do not consider INFO field for export into fields
11615            if field not in ["INFO"]:
11616                clause_select.append(
11617                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11618                )
11619                clause_to_json.append(f""" '{field}': "{field}" """)
11620                clause_to_format.append(f""" "{field}" """)
11621
11622        # Update
11623        update_set_json = []
11624        update_set_format = []
11625
11626        # VCF header
11627        vcf_reader = self.get_header()
11628
11629        # Transcripts to info column in JSON
11630        if transcripts_info_json:
11631
11632            # Create column on variants table
11633            self.add_column(
11634                table_name=table_variants,
11635                column_name=transcripts_info_json,
11636                column_type="JSON",
11637                default_value=None,
11638                drop=False,
11639            )
11640
11641            # Add header
11642            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11643                transcripts_info_json,
11644                ".",
11645                "String",
11646                "Transcripts in JSON format",
11647                "unknwon",
11648                "unknwon",
11649                self.code_type_map["String"],
11650            )
11651
11652            # Add to update
11653            update_set_json.append(
11654                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11655            )
11656
11657        # Transcripts to info field in JSON
11658        if transcripts_info_field_json:
11659
11660            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11661
11662            # Add to update
11663            update_set_json.append(
11664                f""" 
11665                    INFO = concat(
11666                            CASE
11667                                WHEN INFO NOT IN ('', '.')
11668                                THEN INFO
11669                                ELSE ''
11670                            END,
11671                            CASE
11672                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11673                                THEN concat(
11674                                    ';{transcripts_info_field_json}=',
11675                                    t.{transcripts_info_json}
11676                                )
11677                                ELSE ''
11678                            END
11679                            )
11680                """
11681            )
11682
11683            # Add header
11684            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11685                transcripts_info_field_json,
11686                ".",
11687                "String",
11688                "Transcripts in JSON format",
11689                "unknwon",
11690                "unknwon",
11691                self.code_type_map["String"],
11692            )
11693
11694        if update_set_json:
11695
11696            # Update query
11697            query_update = f"""
11698                UPDATE {table_variants}
11699                    SET {", ".join(update_set_json)}
11700                FROM
11701                (
11702                    SELECT
11703                        "#CHROM", POS, REF, ALT,
11704                            concat(
11705                            '{{',
11706                            string_agg(
11707                                '"' || "{transcripts_column_id}" || '":' ||
11708                                to_json(json_output)
11709                            ),
11710                            '}}'
11711                            )::JSON AS {transcripts_info_json}
11712                    FROM
11713                        (
11714                        SELECT
11715                            "#CHROM", POS, REF, ALT,
11716                            "{transcripts_column_id}",
11717                            to_json(
11718                                {{{",".join(clause_to_json)}}}
11719                            )::JSON AS json_output
11720                        FROM
11721                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11722                        WHERE "{transcripts_column_id}" IS NOT NULL
11723                        )
11724                    GROUP BY "#CHROM", POS, REF, ALT
11725                ) AS t
11726                WHERE {table_variants}."#CHROM" = t."#CHROM"
11727                    AND {table_variants}."POS" = t."POS"
11728                    AND {table_variants}."REF" = t."REF"
11729                    AND {table_variants}."ALT" = t."ALT"
11730            """
11731
11732            self.execute_query(query=query_update)
11733
11734        # Transcripts to info column in FORMAT
11735        if transcripts_info_format:
11736
11737            # Create column on variants table
11738            self.add_column(
11739                table_name=table_variants,
11740                column_name=transcripts_info_format,
11741                column_type="VARCHAR",
11742                default_value=None,
11743                drop=False,
11744            )
11745
11746            # Add header
11747            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11748                transcripts_info_format,
11749                ".",
11750                "String",
11751                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11752                "unknwon",
11753                "unknwon",
11754                self.code_type_map["String"],
11755            )
11756
11757            # Add to update
11758            update_set_format.append(
11759                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11760            )
11761
11762        else:
11763
11764            # Set variable for internal queries
11765            transcripts_info_format = "transcripts_info_format"
11766
11767        # Transcripts to info field in JSON
11768        if transcripts_info_field_format:
11769
11770            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11771
11772            # Add to update
11773            update_set_format.append(
11774                f""" 
11775                    INFO = concat(
11776                            CASE
11777                                WHEN INFO NOT IN ('', '.')
11778                                THEN INFO
11779                                ELSE ''
11780                            END,
11781                            CASE
11782                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11783                                THEN concat(
11784                                    ';{transcripts_info_field_format}=',
11785                                    t.{transcripts_info_format}
11786                                )
11787                                ELSE ''
11788                            END
11789                            )
11790                """
11791            )
11792
11793            # Add header
11794            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11795                transcripts_info_field_format,
11796                ".",
11797                "String",
11798                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11799                "unknwon",
11800                "unknwon",
11801                self.code_type_map["String"],
11802            )
11803
11804        if update_set_format:
11805
11806            # Update query
11807            query_update = f"""
11808                UPDATE {table_variants}
11809                    SET {", ".join(update_set_format)}
11810                FROM
11811                (
11812                    SELECT
11813                        "#CHROM", POS, REF, ALT,
11814                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11815                    FROM 
11816                        (
11817                        SELECT
11818                            "#CHROM", POS, REF, ALT,
11819                            "{transcripts_column_id}",
11820                            concat(
11821                                "{transcripts_column_id}",
11822                                '|',
11823                                {", '|', ".join(clause_to_format)}
11824                            ) AS {transcripts_info_format}
11825                        FROM
11826                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11827                        )
11828                    GROUP BY "#CHROM", POS, REF, ALT
11829                ) AS t
11830                WHERE {table_variants}."#CHROM" = t."#CHROM"
11831                    AND {table_variants}."POS" = t."POS"
11832                    AND {table_variants}."REF" = t."REF"
11833                    AND {table_variants}."ALT" = t."ALT"
11834            """
11835
11836            self.execute_query(query=query_update)
11837
11838        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument
Returns

The function transcript_view_to_variants returns a boolean value. It returns True if the operation is successful and False if certain conditions are not met.

def rename_info_fields(self, fields_to_rename: dict = None, table: str = None) -> dict:
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        The `rename_info_fields` function renames specified fields in a VCF file header and updates
        corresponding INFO fields in the variants table.

        A mapped value of None means "remove the field" (dropped from the header and stripped
        from INFO) rather than renaming it. Nothing is done when the configured access mode
        is read-only ("RO").

        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
        represent the original field names that need to be renamed, and the corresponding values
        represent the new names to which the fields should be renamed (or None to remove the field)
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
        the table in which the variants data is stored. Defaults to the object's variants table when
        not provided
        :type table: str
        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
        the original field names as keys and their corresponding new names (or None if the field was
        removed) as values after renaming or removing specified fields in a VCF file header and updating
        corresponding INFO fields in the variants table.
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        if table is None:
            table = self.get_table_variants()

        # regexp replace function: renames are compiled into nested regexp_replace()
        # expressions, partitioned into batches of `regex_replace_partition` renames per
        # UPDATE statement — presumably to keep each SQL expression below an engine
        # nesting/complexity limit (TODO confirm the exact limit)
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "concat(INFO, ';')"  # Add ';' to reduce regexp complexity (every field entry ends with ';')

        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header: copy metadata under the new name, then drop the old entry
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Rename INFO patterns: match 'FIELD' or 'FIELD=value' at start or after ';'
                    # NOTE(review): field names are interpolated unescaped — a name containing
                    # regex metacharacters would break the pattern; confirm upstream validation
                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
                    if field_renamed is not None:
                        field_renamed_pattern = rf"\1{field_renamed}\3;"
                    else:
                        # Removal: collapse the whole 'FIELD[=value];' run, keeping the separator
                        field_renamed_pattern = r"\1"

                    # regexp replace: nest this rename into the current partition's expression;
                    # when the counter crosses a partition boundary, start a fresh expression
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(
                        regex_replace_nb / regex_replace_partition
                    )
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "concat(INFO, ';')"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
                        )
                    else:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' removed"
                        )

                else:

                    log.warning(
                        f"Rename or remove fields - field '{field_to_rename}' not in header"
                    )

            # Rename INFO: one UPDATE per partition; the trailing ';' added above is stripped back off
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(
                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
                )
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = regexp_replace({regex_replace}, ';$', '')
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed

The rename_info_fields function renames specified fields in a VCF file header and updates corresponding INFO fields in the variants table.

Parameters
  • fields_to_rename: The fields_to_rename parameter is a dictionary that contains the mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be
  • table: The table parameter in the rename_info_fields function represents the name of the table in which the variants data is stored. This table contains information about genetic variants, and the function updates the corresponding INFO fields in this table when renaming specified fields in the VCF file header
Returns

The rename_info_fields function returns a dictionary fields_renamed that contains the original field names as keys and their corresponding new names (or None if the field was removed) as values after renaming or removing specified fields in a VCF file header and updating corresponding INFO fields in the variants table.

def calculation_rename_info_fields( self, fields_to_rename: dict = None, table: str = None, operation_name: str = 'RENAME_INFO_FIELDS') -> None:
11952    def calculation_rename_info_fields(
11953        self,
11954        fields_to_rename: dict = None,
11955        table: str = None,
11956        operation_name: str = "RENAME_INFO_FIELDS",
11957    ) -> None:
11958        """
11959        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11960        fields to rename and table if provided, and then calls another function to rename the fields.
11961
11962        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11963        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11964        the key and the new field name as the value
11965        :type fields_to_rename: dict
11966        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11967        specify the name of the table for which the fields are to be renamed. It is a string type
11968        parameter
11969        :type table: str
11970        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11971        method is a string that specifies the name of the operation being performed. In this context, it
11972        is used as a default value for the operation name if not explicitly provided when calling the
11973        function, defaults to RENAME_INFO_FIELDS
11974        :type operation_name: str (optional)
11975        """
11976
11977        # Param
11978        param = self.get_param()
11979
11980        # Get param fields to rename
11981        param_fields_to_rename = (
11982            param.get("calculation", {})
11983            .get("calculations", {})
11984            .get(operation_name, {})
11985            .get("fields_to_rename", None)
11986        )
11987
11988        # Get param table
11989        param_table = (
11990            param.get("calculation", {})
11991            .get("calculations", {})
11992            .get(operation_name, {})
11993            .get("table", None)
11994        )
11995
11996        # Init fields_to_rename
11997        if fields_to_rename is None:
11998            fields_to_rename = param_fields_to_rename
11999
12000        # Init table
12001        if table is None:
12002            table = param_table
12003
12004        renamed_fields = self.rename_info_fields(
12005            fields_to_rename=fields_to_rename, table=table
12006        )
12007
12008        log.debug(f"renamed_fields:{renamed_fields}")

The calculation_rename_info_fields function retrieves parameters from a dictionary, updates fields to rename and table if provided, and then calls another function to rename the fields.

Parameters
  • fields_to_rename: fields_to_rename is a dictionary that contains the fields to be renamed in a table. Each key-value pair in the dictionary represents the original field name as the key and the new field name as the value
  • table: The table parameter in the calculation_rename_info_fields method is used to specify the name of the table for which the fields are to be renamed. It is a string type parameter
  • operation_name: The operation_name parameter in the calculation_rename_info_fields method is a string that specifies the name of the operation being performed. In this context, it is used as a default value for the operation name if not explicitly provided when calling the function, defaults to RENAME_INFO_FIELDS